Skip to content

Commit

Permalink
update docker image and bump DSE
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffra authored and Ubuntu committed Sep 10, 2020
1 parent 6bb5c69 commit b29229b
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 46 deletions.
2 changes: 1 addition & 1 deletion DeepSpeedExamples
Submodule DeepSpeedExamples updated 35 files
+19 −0 BingBertSquad/ckpt/bert-large-uncased-whole-word-masking-config.json
+340 −0 BingBertSquad/convert_bert_ckpt_to_deepspeed.py
+19 −0 BingBertSquad/deepspeed_onebitadam_bsz96_config.json
+0 −1 BingBertSquad/nvidia_run_squad_baseline.py
+98 −70 BingBertSquad/nvidia_run_squad_deepspeed.py
+59 −0 BingBertSquad/run_hf.sh
+1 −2 BingBertSquad/run_squad_deepspeed.sh
+58 −0 BingBertSquad/run_squad_deepspeed_onebitadam.sh
+58 −0 BingBertSquad/run_squad_mpi_onebitadam.sh
+13 −23 BingBertSquad/utils.py
+1 −0 CODEOWNERS
+4 −0 Megatron-LM/arguments.py
+29 −29 Megatron-LM/data_utils/corpora.py
+16 −6 Megatron-LM/pretrain_gpt2.py
+1 −1 Megatron-LM/scripts/ds_checkpoint_check.sh
+26 −0 Megatron-LM/scripts/ds_zero-offload_10B_config.json
+50 −0 Megatron-LM/scripts/ds_zero-offload_10B_pretrain_gpt2_model_parallel.sh
+24 −0 Megatron-LM/scripts/ds_zero-offload_config.json
+49 −0 Megatron-LM/scripts/ds_zero-offload_pretrain_gpt2_model_parallel.sh
+6 −11 Megatron-LM/scripts/ds_zero2_config.json
+1 −1 Megatron-LM/scripts/ds_zero2_pretrain_gpt2_model_parallel.sh
+59 −0 bing_bert/bert_base.json
+59 −0 bing_bert/bert_large.json
+24 −0 bing_bert/deepspeed_bsz4k_onebit_config_seq128.json
+10 −0 bing_bert/deepspeed_bsz64k_lamb_config_seq128.json
+22 −3 bing_bert/deepspeed_train.py
+25 −0 bing_bert/ds_sa_train_bert_bsz64k_seq128.sh
+23 −0 bing_bert/ds_train_bert_onebit_bsz4k_seq128.sh
+28 −0 bing_bert/mpi_train_bert_onebitadam_bsz4k_seq128.sh
+92 −12 bing_bert/nvidia/modelingpreln.py
+4 −0 bing_bert/utils.py
+47 −0 pipeline_parallelism/alexnet.py
+19 −0 pipeline_parallelism/ds_config.json
+3 −0 pipeline_parallelism/run.sh
+157 −0 pipeline_parallelism/train.py
2 changes: 1 addition & 1 deletion azure-pipelines-docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ stages:
displayName: Build and Push
inputs:
command: buildAndPush
dockerfile: '$(Build.SourcesDirectory)/Dockerfile'
dockerfile: '$(Build.SourcesDirectory)/docker/Dockerfile'
repository: deepspeed/deepspeed
tags: |
$(tag)
Expand Down
136 changes: 92 additions & 44 deletions Dockerfile → docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@ RUN mkdir -p ${STAGE_DIR}
##############################################################################
RUN apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common \
openssh-client openssh-server \
pdsh curl sudo net-tools \
vim iputils-ping wget
#llvm-9-dev cmake
software-properties-common build-essential autotools-dev \
nfs-common pdsh \
cmake g++ gcc \
curl wget vim tmux emacs less unzip \
htop iftop iotop ca-certificates openssh-client openssh-server \
rsync iputils-ping net-tools sudo \
llvm-9-dev

##############################################################################
# Install Latest Git
Expand All @@ -25,6 +27,43 @@ RUN add-apt-repository ppa:git-core/ppa -y && \
apt-get install -y git && \
git --version

##############################################################################
# Client Liveness & Uncomment Port 22 for SSH Daemon
##############################################################################
# Server-side keep-alive: append ClientAliveInterval to sshd_config, then
# rewrite the file through a staged copy with the first "#Port 22" line
# uncommented.
RUN echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config && \
    cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
    sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config

##############################################################################
# Mellanox OFED
##############################################################################
# Downloads the MLNX_OFED bundle for the pinned version and runs its installer
# in user-space-only mode without firmware updates, then removes the unpacked
# bundle from STAGE_DIR in the same layer.
ENV MLNX_OFED_VERSION=4.6-1.0.1.1
# Refresh the package index in the same layer as the install: relying on an
# index cached by an earlier layer can point at packages that no longer exist.
RUN apt-get update && \
    apt-get install -y libnuma-dev
# NOTE(review): the bundle is fetched over plain HTTP and is not checksummed
# before its installer runs -- consider pinning a SHA-256 for this version.
RUN cd ${STAGE_DIR} && \
    wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz | tar xzf - && \
    cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64 && \
    ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
    cd ${STAGE_DIR} && \
    rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64*

##############################################################################
# nv_peer_mem
##############################################################################
# Builds and installs the nvidia-peer-memory Debian package from the
# Mellanox/nv_peer_memory sources.
# NV_PEER_MEM_TAG selects the git branch/tag that is cloned and appears in the
# .deb file name; NV_PEER_MEM_VERSION names the source tarball and the
# directory it unpacks to.
ENV NV_PEER_MEM_VERSION=1.1
ENV NV_PEER_MEM_TAG=1.1-0
# Steps, in order: clone at the tag, run build_module.sh, unpack the .orig
# tarball expected in STAGE_DIR afterwards (presumably produced by
# build_module.sh -- TODO confirm), install dkms, build the package without
# signing (-us -uc), and install the resulting .deb. The order matters: each
# step consumes the output of the previous one.
RUN mkdir -p ${STAGE_DIR} && \
git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
cd ${STAGE_DIR}/nv_peer_memory && \
./build_module.sh && \
cd ${STAGE_DIR} && \
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
apt-get update && \
apt-get install -y dkms && \
dpkg-buildpackage -us -uc && \
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb

##############################################################################
# OPENMPI
##############################################################################
Expand Down Expand Up @@ -63,13 +102,57 @@ RUN apt-get install -y python3 python3-dev && \
# Print python and pip version
python -V && pip -V
RUN pip install pyyaml
RUN pip install ipython

##############################################################################
# TensorFlow
##############################################################################
# GPU build of TensorFlow, pinned through TENSORFLOW_VERSION.
ENV TENSORFLOW_VERSION=1.15.2
RUN pip install "tensorflow-gpu==${TENSORFLOW_VERSION}"

##############################################################################
# Some Packages
##############################################################################
# System libraries and tools installed with apt.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libsndfile-dev \
        libcupti-dev \
        libjpeg-dev \
        libpng-dev \
        screen
# Python utility packages installed with pip.
# Each package is listed once: `pandas` used to appear twice, and the PyPI
# name `sklearn` was dropped because it is only a deprecated placeholder for
# `scikit-learn` (kept below) whose installation now fails.
RUN pip install psutil \
        yappi \
        cffi \
        ipdb \
        pandas \
        matplotlib \
        py3nvml \
        pyarrow \
        graphviz \
        astor \
        boto3 \
        tqdm \
        sentencepiece \
        msgpack \
        requests \
        sphinx \
        sphinx_rtd_theme \
        scipy \
        numpy \
        scikit-learn \
        nvidia-ml-py3 \
        mpi4py \
        cupy-cuda100

##############################################################################
## SSH daemon port inside container cannot conflict with host OS port
###############################################################################
# Rewrites the first Port directive in sshd_config to SSH_PORT via a staged
# copy of the file.
# The pattern accepts the directive both commented ("#Port 22") and already
# uncommented ("Port <n>"): an earlier step in this file uncomments "#Port 22",
# after which a pattern anchored on "#Port 22" matches nothing and the port
# would silently stay at 22.
ENV SSH_PORT=2222
RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
    sed -E "0,/^#?Port [0-9]+/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config

##############################################################################
# PyTorch
##############################################################################
Expand All @@ -81,46 +164,11 @@ RUN pip install torchvision==${TORCHVISION_VERSION}
RUN pip install tensorboardX==${TENSORBOARDX_VERSION}

##############################################################################
# Temporary Installation Directory
##############################################################################
ENV STAGE_DIR=/tmp
RUN mkdir -p ${STAGE_DIR}

##############################################################################
# Mellanox OFED
##############################################################################
ENV MLNX_OFED_VERSION=4.6-1.0.1.1
RUN apt-get install -y libnuma-dev
RUN cd ${STAGE_DIR} && \
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz | tar xzf - && \
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64 && \
./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
cd ${STAGE_DIR} && \
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64*

##############################################################################
# nv_peer_mem
# PyYAML build issue
# https://stackoverflow.com/a/53926898
##############################################################################
ENV NV_PEER_MEM_VERSION=1.1
ENV NV_PEER_MEM_TAG=1.1-0
RUN mkdir -p ${STAGE_DIR} && \
git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
cd ${STAGE_DIR}/nv_peer_memory && \
./build_module.sh && \
cd ${STAGE_DIR} && \
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
apt-get update && \
apt-get install -y dkms && \
dpkg-buildpackage -us -uc && \
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb

##############################################################################
## SSH daemon port inside container cannot conflict with host OS port
###############################################################################
ENV SSH_PORT=2222
RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
sed "0,/^#Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
# Delete the yaml package directory and any PyYAML-* metadata from the
# system-wide python3 dist-packages directory.
RUN rm -rf \
        /usr/lib/python3/dist-packages/yaml \
        /usr/lib/python3/dist-packages/PyYAML-*

##############################################################################
## Add deepspeed user
Expand Down

0 comments on commit b29229b

Please sign in to comment.