Refactoring FSDP. (#1586)
* refactor fsdp

* add trainer

* remove hidden layers

* update dockerfile

---------

Co-authored-by: Adam Louly <[email protected]@orttrainingdev9.d32nl1ml4oruzj4qz3bqlggovf.px.internal.cloudapp.net>
Co-authored-by: JingyaHuang <[email protected]>
3 people authored Dec 26, 2023
1 parent 1a807fc commit 5017d06
Showing 3 changed files with 10 additions and 4 deletions.
@@ -65,12 +65,15 @@ RUN $PYTHON_EXE -m pip install onnx ninja
 RUN $PYTHON_EXE -m pip install torch==${TORCH_VERSION} torchvision==${TORCHVISION_VERSION} -f https://download.pytorch.org/whl/${TORCH_CUDA_VERSION}
 
 # ORT Module
-RUN $PYTHON_EXE -m pip install onnxruntime-training==1.16.1 -f https://download.onnxruntime.ai/onnxruntime_stable_cu118.html
+RUN $PYTHON_EXE -m pip install onnxruntime-training==1.16.3 -f https://download.onnxruntime.ai/onnxruntime_stable_cu118.html
 RUN $PYTHON_EXE -m pip install torch-ort
 ENV TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX"
 RUN $PYTHON_EXE -m pip install --upgrade protobuf==3.20.2
 RUN $PYTHON_EXE -m torch_ort.configure
+
+# https://github.com/vllm-project/vllm/issues/1726
+RUN pip uninstall nvidia-nccl-cu12 -y
 
 WORKDIR .
 
 CMD ["/bin/bash"]
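The `pip uninstall nvidia-nccl-cu12 -y` line added above works around the clash described in vllm-project/vllm#1726, where a separately installed `nvidia-nccl-cu12` wheel can shadow the NCCL bundled with the CUDA 11.8 torch wheel. As an illustration (the helper name below is ours, not part of this diff), pip-installed NCCL distributions can be listed via `importlib.metadata` to confirm the stray wheel is gone:

```python
# Hedged sketch: list pip-installed distributions whose name contains "nccl",
# so a stray nvidia-nccl-cu12 (see vllm-project/vllm#1726) can be spotted
# before it conflicts with the NCCL shipped inside the torch wheel.
from importlib import metadata


def installed_nccl_packages():
    """Names of all pip-installed distributions containing 'nccl'."""
    names = []
    for dist in metadata.distributions():
        name = dist.metadata["Name"] or ""  # guard against broken metadata
        if "nccl" in name.lower():
            names.append(name)
    return sorted(names)


print(installed_nccl_packages())
```

After the Dockerfile's uninstall step, this is expected to print an empty list (or at least no `cu12` entry) inside the image.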
4 changes: 2 additions & 2 deletions optimum/onnxruntime/trainer.py
@@ -455,7 +455,7 @@ def _inner_training_loop(
         else:
             debug_overflow = DebugUnderflowOverflow(self.model)  # noqa
 
-        delay_optimizer_creation = is_sagemaker_mp_enabled() or self.fsdp is not None or self.is_fsdp_enabled
+        delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled
 
         # Wrap the model with `ORTModule`
         logger.info("Wrap ORTModule for ONNX Runtime training.")
@@ -883,7 +883,7 @@ def _wrap_model(self, model, training=True, dataloader=None):
             return model
 
         # Distributed training using PyTorch FSDP
-        if self.fsdp is not None:
+        if self.is_fsdp_xla_enabled:
             try:
                 from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP
                 from torch_xla.distributed.fsdp import checkpoint_module
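Both hunks above replace the old `self.fsdp is not None` test with an explicit per-backend boolean. A minimal sketch of the resulting dispatch (only the attribute names `is_fsdp_xla_enabled` and `is_fsdp_enabled` come from the diff; the class and method names are ours, for illustration):

```python
# Minimal sketch of the flag-based dispatch the refactor moves to: one boolean
# per FSDP backend instead of testing `self.fsdp is not None`. Everything
# except the two flag names is illustrative, not the trainer's real API.
class FsdpDispatchSketch:
    def __init__(self, xla=False, native=False):
        self.is_fsdp_xla_enabled = xla   # torch_xla FSDP (XLA/TPU path)
        self.is_fsdp_enabled = native    # native PyTorch FSDP

    def delay_optimizer_creation(self):
        # Under either FSDP flavor the optimizer is created only after the
        # model is wrapped, so it references the sharded parameters.
        return self.is_fsdp_xla_enabled or self.is_fsdp_enabled

    def wrap_backend(self):
        # Mirrors the `_wrap_model` branch above: the XLA flag selects the
        # torch_xla FSDP wrapping path.
        return "torch_xla" if self.is_fsdp_xla_enabled else "native_or_none"
```

Keeping the two flags separate means the XLA wrapping path in `_wrap_model` and the optimizer-delay logic in `_inner_training_loop` can no longer disagree about whether FSDP is active.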
5 changes: 4 additions & 1 deletion tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer
@@ -65,12 +65,15 @@ RUN $PYTHON_EXE -m pip install onnx ninja
 RUN $PYTHON_EXE -m pip install torch==${TORCH_VERSION} torchvision==${TORCHVISION_VERSION} -f https://download.pytorch.org/whl/${TORCH_CUDA_VERSION}
 
 # ORT Module
-RUN $PYTHON_EXE -m pip install onnxruntime-training==1.16.1 -f https://download.onnxruntime.ai/onnxruntime_stable_cu118.html
+RUN $PYTHON_EXE -m pip install onnxruntime-training==1.16.3 -f https://download.onnxruntime.ai/onnxruntime_stable_cu118.html
 RUN $PYTHON_EXE -m pip install torch-ort
 ENV TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX"
 RUN $PYTHON_EXE -m pip install --upgrade protobuf==3.20.2
 RUN $PYTHON_EXE -m torch_ort.configure
+
+# https://github.com/vllm-project/vllm/issues/1726
+RUN pip uninstall nvidia-nccl-cu12 -y
 
 # Install Optimum
 COPY . /workspace/optimum
 RUN pip install /workspace/optimum[tests]
