diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc index ebb3f97895f06..981d96336b38b 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc @@ -314,6 +314,18 @@ void AddOperationInput(MILSpec::Operation& op, std::string_view input_name, std: (*op.mutable_inputs())[input_name] = std::move(arg); } +void AddIntermediateOperationOutput(COREML_SPEC::MILSpec::Operation& op, const std::string& output_name, + int32_t element_type, std::optional> shape) { + auto& outputs = *op.mutable_outputs(); + auto& output_arg = *outputs.Add(); + output_arg.set_name(output_name); + + MILSpec::ValueType& value = *output_arg.mutable_type(); + MILSpec::TensorType& tensor_type = *value.mutable_tensortype(); + + SetTensorTypeInfo(tensor_type, OnnxDataTypeToMILSpec(element_type), shape, /*convert_scalar*/ true); +} + void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& output, std::optional override_element_type) { auto& outputs = *op.mutable_outputs(); diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h index f012e6af0d718..7ff1e6b8dbf26 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h +++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h @@ -130,6 +130,17 @@ void AddOperationInput(COREML_SPEC::MILSpec::Operation& op, std::string_view input_name, std::string_view value_name); /// +/// Add an output to a MILSpec::Operation for an intermediate operation when the implementation is composed of +/// multiple MLProgram operations. In this case we don't have a NodeArg for the output. +/// +/// Operation to update. +/// Name of the intermediate output. Create using ModelBuilder::GetUniqueName. +/// onnx::TensorProto_DataType element type of the output. 
+/// int32_t as that is what TensorShapeProto uses to store the value. +/// Shape of the output if known. +void AddIntermediateOperationOutput(COREML_SPEC::MILSpec::Operation& op, const std::string& output_name, + int32_t element_type, std::optional> shape); +/// /// Add an output to a MILSpec::Operation. Name, data type and shape are used from the NodeArg. /// /// Operation to update. diff --git a/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc index 1eba312b2577b..bec2461ffbc52 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc @@ -4,6 +4,7 @@ #include "core/common/safeint.h" #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" #include "core/providers/coreml/builders/model_builder.h" #include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/coreml/shape_utils.h" @@ -18,52 +19,133 @@ class DepthToSpaceOpBuilder : public BaseOpBuilder { bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } }; Status DepthToSpaceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, - const logging::Logger& /* logger */) const { - std::unique_ptr layer = model_builder.CreateNNLayer(node); - + [[maybe_unused]] const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const auto& output_defs = node.OutputDefs(); const auto& input_name = input_defs[0]->Name(); - const auto& output_name = output_defs[0]->Name(); - uint64_t blocksize = SafeInt(node.GetAttributes().at("blocksize").i()); + NodeAttrHelper helper(node); + int64_t 
blocksize = *helper.GetInt64("blocksize"); // required attribute + +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; // NOLINT + + const auto mode = helper.Get("mode", "DCR"); + + if (mode == "DCR") { + // DCR is directly supported + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation.depth_to_space + // Validated with depth_to_space.py. + auto op = model_builder.CreateOperation(node, "depth_to_space"); + AddOperationInput(*op, "x", input_name); + AddOperationInput(*op, "block_size", model_builder.AddScalarConstant(op->type(), "blocksize", blocksize)); + AddOperationOutput(*op, *output_defs[0]); + model_builder.AddOperation(std::move(op)); + } else { + // CRD is manual. there may be a perf cost from the Reshape's (typically that happens on CPU) but if the input + // is a fixed size hopefully CoreML is smart enough to handle that aspect during model compilation instead + // of execution. + + // https://github.com/onnx/onnx/blob/main/docs/Operators.md#depthtospace + // b, c, h, w = x.shape + // tmp = np.reshape(x, [b, c // (blocksize ** 2), blocksize, blocksize, h, w]) + // tmp = np.transpose(tmp, [0, 1, 4, 2, 5, 3]) + // y = np.reshape(tmp, [b, c // (blocksize ** 2), h * blocksize, w * blocksize]) + // + // CoreML has a 5D limit, so we merge the batch dim into the channel dim as that doesn't change the data + // movement. 
+ // First reshape is to [b * c // (blocksize ** 2), blocksize, blocksize, h, w] + // Transpose is to [0, 3, 1, 4, 2] + + // we checked shape was static in IsOpSupportedImpl so this should never fail + std::vector input_shape; + ORT_RETURN_IF_NOT(GetStaticShape(*input_defs[0], input_shape, logger), "Failed to get input shape"); + const int32_t elem_type = static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + + // reshape to [b * c // (blocksize ** 2), blocksize, blocksize, h, w] + auto reshape1 = model_builder.CreateOperation(node, "reshape", "pre"); + std::vector shape1 = {input_shape[0] * input_shape[1] / (blocksize * blocksize), + blocksize, blocksize, input_shape[2], input_shape[3]}; + AddOperationInput(*reshape1, "x", input_name); + AddOperationInput(*reshape1, "shape", model_builder.AddConstant(reshape1->type(), "shape", shape1)); + const auto& reshape1_output = model_builder.GetUniqueName(node, "reshape1"); + AddIntermediateOperationOutput(*reshape1, reshape1_output, elem_type, shape1); + + // transpose to [0, 3, 1, 4, 2] + auto transpose = model_builder.CreateOperation(node, "transpose"); + std::vector perm = {0, 3, 1, 4, 2}; + std::vector shape2 = {shape1[0], shape1[3], shape1[1], shape1[4], shape1[2]}; + AddOperationInput(*transpose, "x", reshape1_output); + AddOperationInput(*transpose, "perm", model_builder.AddConstant(transpose->type(), "perm", perm)); + const auto& transpose_output = model_builder.GetUniqueName(node, "transpose"); + AddIntermediateOperationOutput(*transpose, transpose_output, elem_type, shape2); + + // reshape to [b, c // (blocksize ** 2), h * blocksize, w * blocksize] + auto reshape2 = model_builder.CreateOperation(node, "reshape", "post"); + std::vector shape3 = {input_shape[0], + input_shape[1] / (blocksize * blocksize), + input_shape[2] * blocksize, + input_shape[3] * blocksize}; + AddOperationInput(*reshape2, "x", transpose_output); + AddOperationInput(*reshape2, "shape", model_builder.AddConstant(reshape2->type(), "shape", 
shape3)); + + AddOperationOutput(*reshape2, *output_defs[0]); + + model_builder.AddOperation(std::move(reshape1)); + model_builder.AddOperation(std::move(transpose)); + model_builder.AddOperation(std::move(reshape2)); + } + } else // NOLINT +#endif // if defined(COREML_ENABLE_MLPROGRAM) + { + const auto& output_name = output_defs[0]->Name(); + std::unique_ptr layer = model_builder.CreateNNLayer(node); - auto* coreml_depthtospace = layer->mutable_reorganizedata(); - coreml_depthtospace->set_blocksize(blocksize); - coreml_depthtospace->set_mode(CoreML::Specification::ReorganizeDataLayerParams_ReorganizationType:: - ReorganizeDataLayerParams_ReorganizationType_DEPTH_TO_SPACE); + auto* coreml_depthtospace = layer->mutable_reorganizedata(); + coreml_depthtospace->set_blocksize(static_cast(blocksize)); + coreml_depthtospace->set_mode(CoreML::Specification::ReorganizeDataLayerParams_ReorganizationType:: + ReorganizeDataLayerParams_ReorganizationType_DEPTH_TO_SPACE); - *layer->mutable_input()->Add() = input_name; - *layer->mutable_output()->Add() = output_name; + *layer->mutable_input()->Add() = input_name; + *layer->mutable_output()->Add() = output_name; + + model_builder.AddLayer(std::move(layer)); + } - model_builder.AddLayer(std::move(layer)); return Status::OK(); } -bool DepthToSpaceOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& /*input_params*/, +bool DepthToSpaceOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); std::vector input_shape; if (!GetShape(*input_defs[0], input_shape, logger)) { + LOGS(logger, VERBOSE) << "DepthToSpace: no input shape"; return false; } - const auto input_rank = input_shape.size(); - if (input_rank < 4) { - LOGS(logger, VERBOSE) << "DepthToSpace does not support input shape of " << input_rank << "d shape."; - } + // ONNX and CoreML both require 4D input so no need to check the shape here. 
NodeAttrHelper helper(node); - if (node.SinceVersion() >= 11) { - // For now, only DCR mode DepthToSpace is supported - const auto mode = helper.Get("mode", "DCR"); + const auto mode = helper.Get("mode", "DCR"); + + if (input_params.create_mlprogram) { + if (mode == "CRD" && !IsStaticShape(input_shape)) { + // we need to manually implement the logic with a Reshape, so we need to know the shape to do that + LOGS(logger, VERBOSE) << "DepthToSpace: CRD mode requires static shape"; + return false; + } + } else { if (mode != "DCR") { - LOGS(logger, VERBOSE) << "The mode: " << mode << "of DepthToSpace is not supported in CoreML EP for now."; + LOGS(logger, VERBOSE) << "DepthToSpace: " << mode << " mode is not supported"; return false; } } diff --git a/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc b/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc index 5222380d9ca56..a0c1d675f506f 100644 --- a/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc @@ -373,5 +373,36 @@ TEST(TensorOpTest, DepthToSpaceTest_5) { test.Run(); } +TEST(TensorOpTest, DepthToSpaceTest_CRD_Batched) { + OpTester test("DepthToSpace", 11); // create an opset 11 model with attribute present = "CRD" mode + constexpr int64_t blocksize = 2; + test.AddAttribute("blocksize", blocksize); + test.AddAttribute("mode", "CRD"); + + constexpr int64_t N = 2, C = 4, H = 2, W = 3; + std::vector X = {0., 1., 2., + 3., 4., 5., + 9., 10., 11., + 12., 13., 14., + 18., 19., 20., + 21., 22., 23., + 27., 28., 29., + 30., 31., 32.}; + + // append same data but in reverse order so we can tell if the batch output is wrong + X.insert(X.end(), X.rbegin(), X.rend()); + + test.AddInput("input", {N, C, H, W}, X); + + std::vector result = {0., 9., 1., 10., 2., 11., + 18., 27., 19., 28., 20., 29., + 3., 12., 4., 13., 5., 14., + 21., 30., 22., 31., 23., 32.}; + result.insert(result.end(), result.rbegin(), result.rend()); + + 
test.AddOutput<float>("output", {2, 1, 4, 6}, result);
+  test.Run();
+}
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
index c33184686c932..322e4eed5f9c8 100644
--- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
+++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
@@ -7,6 +7,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution
 |ai.onnx:AveragePool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.|
 |ai.onnx:Clip||
 |ai.onnx:Conv|Only 1D/2D Conv is supported.
Bias if provided must be constant.|
+|ai.onnx:DepthToSpace|If 'mode' is 'CRD' the input must have a fixed shape.|
 |ai.onnx:Div||
 |ai.onnx:Gemm|Input B must be constant.|
 |ai.onnx:GlobalAveragePool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.|