
Distributed Expand #18126

Merged 5 commits on Oct 28, 2023
1 change: 1 addition & 0 deletions cmake/onnxruntime_providers_cuda.cmake
@@ -39,6 +39,7 @@
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_matmul.cc"
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_slice.cc"
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_reshape.cc"
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_expand.cc"
)
endif()
# add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio
1 change: 1 addition & 0 deletions cmake/onnxruntime_rocm_hipify.cmake
@@ -108,6 +108,7 @@ if (NOT onnxruntime_USE_NCCL)
list(APPEND contrib_ops_excluded_files "collective/distributed_matmul.cc")
list(APPEND contrib_ops_excluded_files "collective/distributed_slice.cc")
list(APPEND contrib_ops_excluded_files "collective/distributed_reshape.cc")
list(APPEND contrib_ops_excluded_files "collective/distributed_expand.cc")
endif()

set(provider_excluded_files
110 changes: 110 additions & 0 deletions onnxruntime/contrib_ops/cuda/collective/distributed_expand.cc
@@ -0,0 +1,110 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

// Distributed computation.
#include "distributed_expand.h"
#include "sharding.h"
#include "sharding_spec.h"
#include "nccl_kernels.h"
#include "mpi_include.h"

// ORT system.
#include "core/providers/cuda/tensor/expand.h"

// std C++.
#include <iostream>

namespace onnxruntime {
namespace contrib {
namespace cuda {

#if defined(ORT_USE_NCCL)

template <typename T>
DistributedExpand<T>::DistributedExpand(const OpKernelInfo& info) : DistributedKernel(info) {}

template <typename T>
Status DistributedExpand<T>::ComputeInternal(OpKernelContext* context) const {
  ORT_ENFORCE(context != nullptr);
  // Assumptions:
  // - Shape is not sharded.
  // Algorithm:
  // - Compute logical output shape.
  // - Compute local output shape.
  // - Expand from local input to local output.

  auto input_tensor = context->Input<Tensor>(0);
  auto shape_tensor = context->Input<Tensor>(1);
  const auto& input_sharding_spec = input_shard_specs_.at(0);
  const auto& shape_sharding_spec = input_shard_specs_.at(1);
  const auto& output_sharding_spec = output_shard_specs_.at(0);

  ORT_ENFORCE(shape_sharding_spec.HasNoShard(),
              "It's not worth sharding the Shape tensor. "
              "If sharding the shape is needed, please submit a feature request.");
  // Compute logical input shape.
  const auto original_input_shape = ComputeOriginShape(input_tensor->Shape(), input_sharding_spec);

  // Compute logical output shape.
  // This `shape_tensor` stores the logical output shape.
  const auto* p_shape = shape_tensor->Data<int64_t>();
  TensorShapeVector original_output_dims{p_shape, p_shape + shape_tensor->Shape().Size()};
  TensorShape original_output_shape(original_output_dims);
  ORT_ENFORCE(
      onnxruntime::cuda::ComputeOutputShape(
          Node().Name(),
          original_input_shape,
          original_output_dims, original_output_shape)
          .IsOK());

  // Compute local output shape.
  const auto local_output_shape = ComputeShardShape(original_output_shape, output_sharding_spec);

  auto output_tensor = context->Output(0, local_output_shape);

  return FuncExpand(
      this,
      context,
      input_tensor,
      shape_tensor,
      output_tensor);
}

ONNX_OPERATOR_TYPED_KERNEL_EX(
    DistributedExpand,
    kMSDomain,
    1,
    int64_t,
    kCudaExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::GetTensorType<int64_t>())
        .InputMemoryType(OrtMemTypeCPUInput, 1),
    DistributedExpand<int64_t>);

ONNX_OPERATOR_TYPED_KERNEL_EX(
    DistributedExpand,
    kMSDomain,
    1,
    float,
    kCudaExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::GetTensorType<float>())
        .InputMemoryType(OrtMemTypeCPUInput, 1),
    DistributedExpand<float>);

ONNX_OPERATOR_TYPED_KERNEL_EX(
    DistributedExpand,
    kMSDomain,
    1,
    MLFloat16,
    kCudaExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::GetTensorType<MLFloat16>())
        .InputMemoryType(OrtMemTypeCPUInput, 1),
    DistributedExpand<MLFloat16>);

#endif

}  // namespace cuda
}  // namespace contrib
}  // namespace onnxruntime
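Note: the registrations pin input 1 (the shape tensor) to CPU memory via InputMemoryType(OrtMemTypeCPUInput, 1), which matches ComputeInternal reading shape_tensor->Data<int64_t>() on the host. For intuition about the shape arithmetic, here is a standalone sketch of the logical-vs-local computation. It is a simplification under stated assumptions — a single shard axis (or none) and an even split across devices — not the real ShardingSpec/ComputeOriginShape/ComputeShardShape API:

// Standalone sketch of the shape math in DistributedExpand::ComputeInternal.
// Assumption: one shard axis (-1 = replicated) and an even split across
// `num_devices`; the real ShardingSpec supports general device meshes.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

using Shape = std::vector<int64_t>;

// Recover the logical (unsharded) shape from a local shard.
Shape ComputeOriginShapeSketch(Shape local, int shard_axis, int64_t num_devices) {
  if (shard_axis >= 0) local[shard_axis] *= num_devices;
  return local;
}

// Slice the logical shape down to one device's shard.
Shape ComputeShardShapeSketch(Shape logical, int shard_axis, int64_t num_devices) {
  if (shard_axis >= 0) {
    assert(logical[shard_axis] % num_devices == 0);
    logical[shard_axis] /= num_devices;
  }
  return logical;
}

int main() {
  // Local input [4, 1], sharded on axis 0 across 2 devices -> logical [8, 1].
  const Shape logical_in = ComputeOriginShapeSketch({4, 1}, /*shard_axis=*/0, 2);
  // Logical Expand target [8, 6]; output sharded on axis 0 -> local [4, 6].
  const Shape local_out = ComputeShardShapeSketch({8, 6}, /*shard_axis=*/0, 2);
  std::cout << logical_in[0] << "x" << logical_in[1] << " -> "
            << local_out[0] << "x" << local_out[1] << "\n";  // prints "8x1 -> 4x6"
  return 0;
}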
35 changes: 35 additions & 0 deletions onnxruntime/contrib_ops/cuda/collective/distributed_expand.h
@@ -0,0 +1,35 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "sharding_spec.h"

Check warning on line 4 in onnxruntime/contrib_ops/cuda/collective/distributed_expand.h

View workflow job for this annotation

GitHub Actions / cpplint

[cpplint] onnxruntime/contrib_ops/cuda/collective/distributed_expand.h#L4

Include the directory when naming header files [build/include_subdir] [4]
Raw output
onnxruntime/contrib_ops/cuda/collective/distributed_expand.h:4:  Include the directory when naming header files  [build/include_subdir] [4]
#include "sharding.h"

Check warning on line 5 in onnxruntime/contrib_ops/cuda/collective/distributed_expand.h

View workflow job for this annotation

GitHub Actions / cpplint

[cpplint] onnxruntime/contrib_ops/cuda/collective/distributed_expand.h#L5

Include the directory when naming header files [build/include_subdir] [4]
Raw output
onnxruntime/contrib_ops/cuda/collective/distributed_expand.h:5:  Include the directory when naming header files  [build/include_subdir] [4]
#include "core/providers/cuda/cuda_kernel.h"

#include <algorithm>

Check warning on line 8 in onnxruntime/contrib_ops/cuda/collective/distributed_expand.h

View workflow job for this annotation

GitHub Actions / cpplint

[cpplint] onnxruntime/contrib_ops/cuda/collective/distributed_expand.h#L8

Found C++ system header after other header. Should be: distributed_expand.h, c system, c++ system, other. [build/include_order] [4]
Raw output
onnxruntime/contrib_ops/cuda/collective/distributed_expand.h:8:  Found C++ system header after other header. Should be: distributed_expand.h, c system, c++ system, other.  [build/include_order] [4]
#include <tuple>

Check warning on line 9 in onnxruntime/contrib_ops/cuda/collective/distributed_expand.h

View workflow job for this annotation

GitHub Actions / cpplint

[cpplint] onnxruntime/contrib_ops/cuda/collective/distributed_expand.h#L9

Found C++ system header after other header. Should be: distributed_expand.h, c system, c++ system, other. [build/include_order] [4]
Raw output
onnxruntime/contrib_ops/cuda/collective/distributed_expand.h:9:  Found C++ system header after other header. Should be: distributed_expand.h, c system, c++ system, other.  [build/include_order] [4]
#include <optional>

Check warning on line 10 in onnxruntime/contrib_ops/cuda/collective/distributed_expand.h

View workflow job for this annotation

GitHub Actions / cpplint

[cpplint] onnxruntime/contrib_ops/cuda/collective/distributed_expand.h#L10

Found C++ system header after other header. Should be: distributed_expand.h, c system, c++ system, other. [build/include_order] [4]
Raw output
onnxruntime/contrib_ops/cuda/collective/distributed_expand.h:10:  Found C++ system header after other header. Should be: distributed_expand.h, c system, c++ system, other.  [build/include_order] [4]
#include <string>

Check warning on line 11 in onnxruntime/contrib_ops/cuda/collective/distributed_expand.h

View workflow job for this annotation

GitHub Actions / cpplint

[cpplint] onnxruntime/contrib_ops/cuda/collective/distributed_expand.h#L11

Found C++ system header after other header. Should be: distributed_expand.h, c system, c++ system, other. [build/include_order] [4]
Raw output
onnxruntime/contrib_ops/cuda/collective/distributed_expand.h:11:  Found C++ system header after other header. Should be: distributed_expand.h, c system, c++ system, other.  [build/include_order] [4]
#include <nccl.h>

Check warning on line 12 in onnxruntime/contrib_ops/cuda/collective/distributed_expand.h

View workflow job for this annotation

GitHub Actions / cpplint

[cpplint] onnxruntime/contrib_ops/cuda/collective/distributed_expand.h#L12

Found C system header after other header. Should be: distributed_expand.h, c system, c++ system, other. [build/include_order] [4]
Raw output
onnxruntime/contrib_ops/cuda/collective/distributed_expand.h:12:  Found C system header after other header. Should be: distributed_expand.h, c system, c++ system, other.  [build/include_order] [4]
#include <sstream>

Check warning on line 13 in onnxruntime/contrib_ops/cuda/collective/distributed_expand.h

View workflow job for this annotation

GitHub Actions / cpplint

[cpplint] onnxruntime/contrib_ops/cuda/collective/distributed_expand.h#L13

Found C++ system header after other header. Should be: distributed_expand.h, c system, c++ system, other. [build/include_order] [4]
Raw output
onnxruntime/contrib_ops/cuda/collective/distributed_expand.h:13:  Found C++ system header after other header. Should be: distributed_expand.h, c system, c++ system, other.  [build/include_order] [4]

#pragma once

namespace onnxruntime {
namespace contrib {
namespace cuda {

#if defined(ORT_USE_NCCL)

template <typename T>
class DistributedExpand final : public DistributedKernel {
 public:
  explicit DistributedExpand(const OpKernelInfo& info);

  Status ComputeInternal(OpKernelContext* context) const override;
};

#endif

} // namespace cuda
} // namespace contrib
} // namespace onnxruntime
8 changes: 8 additions & 0 deletions onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc
@@ -170,6 +170,10 @@
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, int64_t, DistributedReshape);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, DistributedReshape);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, DistributedReshape);

class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, int64_t, DistributedExpand);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, DistributedExpand);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, DistributedExpand);
#endif

template <>
@@ -344,6 +348,10 @@
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, int64_t, DistributedReshape)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, DistributedReshape)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, DistributedReshape)>,

BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, int64_t, DistributedExpand)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, DistributedExpand)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, DistributedExpand)>,
#endif

};
37 changes: 37 additions & 0 deletions onnxruntime/core/graph/contrib_ops/collective_defs.cc
@@ -236,6 +236,43 @@ void RegisterCollectiveOps() {
          OpSchema::NonDifferentiable)
      .Output(0, "reshaped", "Reshaped data.", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
      .TypeConstraint("T", OpSchema::all_tensor_types_ir4(), "Constrain input and output types to all tensor types.");

  ONNX_CONTRIB_OPERATOR_SCHEMA(DistributedExpand)
      .SetDomain(kMSDomain)
      .SinceVersion(1)
      .Attr("input_device_mesh_elements",
            "device_mesh_elements[i] defines the device mesh's value for the i-th input. "
            "E.g., device_mesh_elements=[\"[0, 1]\", \"[0, 1]\"] means the 1st and the 2nd "
            "inputs are stored on the 0-th and the 1st devices, respectively.",
            AttributeProto::STRINGS)
      .Attr("input_device_mesh_shapes",
            "device_mesh_shape[i] defines the device mesh's shape for the i-th input.",
            AttributeProto::STRINGS)
      .Attr("input_shard_specs",
            "The sharding spec of inputs. "
            "E.g., if input_shard_specs[i] is \"RRR\", the i-th input is an unsharded 3-D tensor.",
            AttributeProto::STRINGS)
      .Attr("output_device_mesh_elements",
            "Similar to input_device_mesh_elements but for outputs.",
            AttributeProto::STRINGS)
      .Attr("output_device_mesh_shapes",
            "Similar to input_device_mesh_shapes but for outputs.",
            AttributeProto::STRINGS)
      .Attr("output_shard_specs",
            "Similar to input_shard_specs but for outputs.",
            AttributeProto::STRINGS)
      .Input(0, "input", "Input tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
      .Input(
          1,
          "shape",
          "A 1-D tensor that indicates the shape to expand to, following the broadcast rule.",
          "tensor(int64)",
          OpSchema::Single,
          true,
          1,
          OpSchema::NonDifferentiable)
      .Output(0, "output", "Output tensor", "T", OpSchema::Single, true, 1, OpSchema::Differentiable)
      .TypeConstraint("T", OpSchema::all_tensor_types_ir4(), "Constrain input and output types to all tensors.");
}

} // namespace contrib
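Note: the shard-spec strings above use 'R' for a replicated axis and 'S[i]' for an axis sharded over device-mesh axis i (e.g., "S[0]RR"); the real parser lives in sharding_spec.{h,cc}. A minimal, hypothetical parser for just that surface syntax — illustrative only, not the actual implementation:

// Hypothetical sketch: map each tensor axis to a device-mesh axis (-1 = replicated).
// Assumes well-formed input such as "RRR" or "S[0]RR".
#include <iostream>
#include <string>
#include <vector>

std::vector<int> ParseShardSpecSketch(const std::string& spec) {
  std::vector<int> mesh_axis_per_dim;
  for (size_t i = 0; i < spec.size();) {
    if (spec[i] == 'R') {
      mesh_axis_per_dim.push_back(-1);  // replicated axis
      ++i;
    } else {
      // "S[<mesh axis>]"
      const size_t close = spec.find(']', i);
      mesh_axis_per_dim.push_back(std::stoi(spec.substr(i + 2, close - i - 2)));
      i = close + 1;
    }
  }
  return mesh_axis_per_dim;
}

int main() {
  for (int axis : ParseShardSpecSketch("S[0]RR")) std::cout << axis << ' ';  // prints "0 -1 -1"
  std::cout << '\n';
  return 0;
}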
80 changes: 80 additions & 0 deletions onnxruntime/core/providers/cuda/tensor/expand.cc
@@ -142,6 +142,86 @@
      input_strides);
}

Status FuncExpand(
    const CudaKernel* cuda_kernel,
    OpKernelContext* ctx,
    const Tensor* input_data_tensor,
    const Tensor* /*input_shape_tensor*/,
    Tensor* output_tensor) {
  TensorShape output_shape = output_tensor->Shape();

#ifdef ENABLE_STRIDED_TENSORS
  // Strided output.
  if (input_data_tensor->DataRaw() == output_tensor->DataRaw()) {
    gsl::span<const int64_t> input_strides = input_data_tensor->Strides();
    TensorShapeVector output_strides =
        ComputeOutputStrides(input_data_tensor->Shape(), input_strides, output_shape);
    output_tensor->SetShapeAndStrides(output_shape, output_strides);
    return Status::OK();
  }
#endif

  auto output_dims = output_shape.AsShapeVector();
  auto input_dims = input_data_tensor->Shape().AsShapeVector();

  CalcEffectiveDims(input_dims, output_dims);
  int rank = gsl::narrow_cast<int>(output_dims.size());

  TensorPitches original_input_strides(input_dims);
  TensorPitches original_output_strides(output_dims);

  // A broadcast (size-1) dimension reads the same input element for every
  // output index, which is expressed as an input stride of zero.
  TArray<int64_t> input_strides(rank);
  for (auto i = 0; i < rank; i++) {
    input_strides[i] = input_dims[i] == 1 ? 0 : original_input_strides[i];
  }

  TArray<fast_divmod> output_strides(rank);
  for (auto i = 0; i < rank; i++) {
    output_strides[i] = fast_divmod(static_cast<int>(original_output_strides[i]));
  }

  return ExpandImpl(
      cuda_kernel->Stream(ctx),
      input_data_tensor->DataType()->Size(),
      gsl::narrow_cast<int>(output_shape.Size()),
      gsl::narrow_cast<int>(input_data_tensor->Shape().Size()),
      input_data_tensor->DataRaw(),
      output_tensor->MutableDataRaw(),
      output_strides,
      input_strides);
}

std::unique_ptr<Tensor> FuncExpand(
    const CudaKernel* cuda_kernel,
    OpKernelContext* ctx,
    const Tensor* input_data_tensor,
    const Tensor* input_shape_tensor) {
  // New shape to be expanded to.
  const auto* p_shape = input_shape_tensor->Data<int64_t>();
  TensorShapeVector output_dims{p_shape, p_shape + input_shape_tensor->Shape().Size()};
  TensorShape output_shape(output_dims);

  ORT_ENFORCE(
      ComputeOutputShape(
          cuda_kernel->Node().Name(),
          input_data_tensor->Shape(),
          output_dims, output_shape)
          .IsOK());

  // Pre-allocate output.
  AllocatorPtr alloc;
  ORT_ENFORCE(ctx->GetTempSpaceAllocator(&alloc).IsOK());
  auto output_tensor = Tensor::Create(input_data_tensor->DataType(), output_shape, alloc);

  // Only assign output values when the output tensor is non-empty,
  // because an empty tensor doesn't own any data.
  if (output_shape.Size() > 0) {
    ORT_ENFORCE(FuncExpand(cuda_kernel, ctx, input_data_tensor, input_shape_tensor, output_tensor.get()).IsOK());
  }

  return output_tensor;
}
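Note: the non-strided path in FuncExpand implements Expand purely by index arithmetic: after CalcEffectiveDims, every input dimension of size 1 gets input stride 0, so all output positions along a broadcast dimension read the same input element. A minimal CPU model of that mapping (hard-coded 2-D shapes; ExpandImpl performs the equivalent per-element mapping on the GPU):

// CPU model of the zero-stride broadcast used by FuncExpand/ExpandImpl.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Expand input of shape [2, 1] to output of shape [2, 3].
  const std::vector<float> input = {1.f, 2.f};
  const int64_t in_strides[2] = {1, 0};  // size-1 dim 1 is broadcast -> stride 0
  const int64_t out_dims[2] = {2, 3};
  std::vector<float> output(out_dims[0] * out_dims[1]);
  for (int64_t i = 0; i < out_dims[0]; ++i) {
    for (int64_t j = 0; j < out_dims[1]; ++j) {
      output[i * out_dims[1] + j] = input[i * in_strides[0] + j * in_strides[1]];
    }
  }
  for (float v : output) std::cout << v << ' ';  // prints "1 1 1 2 2 2"
  std::cout << '\n';
  return 0;
}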

#ifdef ENABLE_STRIDED_TENSORS
#define CREATE_EXPAND_KERNEL_DEF (*KernelDefBuilder::Create()).MayStridedOutput(0, 0)
#else
13 changes: 13 additions & 0 deletions onnxruntime/core/providers/cuda/tensor/expand.h
@@ -20,5 +20,18 @@
    const TensorShape& rhs_shape,
    TensorShape& out_shape);

Status FuncExpand(
    const CudaKernel* cuda_kernel,
    OpKernelContext* ctx,
    const Tensor* input_data_tensor,
    const Tensor* /*input_shape_tensor*/,
    Tensor* output_tensor);

std::unique_ptr<Tensor> FuncExpand(
    const CudaKernel* cuda_kernel,
    OpKernelContext* ctx,
    const Tensor* input_data_tensor,
    const Tensor* input_shape_tensor);

} // namespace cuda
} // namespace onnxruntime