diff --git a/cmake/onnxruntime_providers_coreml.cmake b/cmake/onnxruntime_providers_coreml.cmake
index c9f35e5337f9b..8f3b1828e1c61 100644
--- a/cmake/onnxruntime_providers_coreml.cmake
+++ b/cmake/onnxruntime_providers_coreml.cmake
@@ -111,7 +111,7 @@ if(_enable_ML_PROGRAM)
   file(GLOB onnxruntime_providers_coreml_modelpackage_cc_srcs CONFIGURE_DEPENDS
     "${coremltools_SOURCE_DIR}/modelpackage/src/ModelPackage.?pp"
-    "${coremltools_SOURCE_DIR}/modelpackage/src/Utils/JsonMap.?pp"
+    "${coremltools_SOURCE_DIR}/modelpackage/src/utils/JsonMap.?pp"
   )
 
   set(coremltools_srcs
diff --git a/onnxruntime/core/providers/coreml/builders/coreml_spec.h b/onnxruntime/core/providers/coreml/builders/coreml_spec.h
index c9adba9e579d0..9448f1167990e 100644
--- a/onnxruntime/core/providers/coreml/builders/coreml_spec.h
+++ b/onnxruntime/core/providers/coreml/builders/coreml_spec.h
@@ -17,14 +17,19 @@
 #ifdef HAS_SHORTEN_64_TO_32
 #pragma GCC diagnostic ignored "-Wshorten-64-to-32"
 #endif
+#elif defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4244)  // conversion from long to int
 #endif
 
 // Model.pb.h is generated in the build output directory from the CoreML protobuf files in
-// onnxruntime/core/providers/coreml/coremltools/mlmodel/format
+// <build output directory>/_deps/coremltools-src/mlmodel/format
 #include "coreml_proto/Model.pb.h"
 
 #if defined(__GNUC__)
 #pragma GCC diagnostic pop
+#elif defined(_MSC_VER)
+#pragma warning(pop)
 #endif
 
 namespace COREML_SPEC = CoreML::Specification;
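Note: the MSVC branch added above is the counterpart of the existing GCC/clang suppression; C4244 is MSVC's diagnostic for the same implicit 64-to-32-bit narrowing that -Wshorten-64-to-32 flags. A minimal sketch of the kind of conversion both toolchains warn about (illustrative only, not code from the patch; generated protobuf headers such as Model.pb.h contain conversions of this shape):

    #include <cstdint>

    int32_t Narrow(int64_t value) {
      // warns: conversion from 'int64_t' to 'int32_t', possible loss of data (C4244 / -Wshorten-64-to-32)
      return value;
    }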
Input:" << input_name + << ", shape: " << Shape2String(shape); + return false; + } } } @@ -125,7 +131,7 @@ std::unordered_set GetSupportedNodes(const GraphViewer& graph_viewe bool CheckIsConstantInitializer(const NodeArg& node_arg, const GraphViewer& graph_viewer, const logging::Logger& logger, std::string_view input_description) { - if (graph_viewer.GetConstantInitializer(node_arg.Name(), true) == nullptr) { + if (graph_viewer.GetConstantInitializer(node_arg.Name()) == nullptr) { LOGS(logger, VERBOSE) << input_description << " (NodeArg name: '" << node_arg.Name() << "') is not a constant initializer tensor"; return false; diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc index 2570e6d88ae0d..83a572f4b60fa 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc @@ -83,9 +83,14 @@ bool BaseOpBuilder::HasSupportedInputs(const Node& node, const OpBuilderInputPar } /* static */ -bool BaseOpBuilder::IsInput0Supported(const Node& node, const OpBuilderInputParams& /*input_params*/, - const logging::Logger& logger) { - const auto& input = *node.InputDefs()[0]; +bool BaseOpBuilder::IsInputFloat(const Node& node, size_t idx, const OpBuilderInputParams& /*input_params*/, + const logging::Logger& logger) { + if (idx >= node.InputDefs().size()) { + LOGS(logger, VERBOSE) << "Input index [" << idx << "] is out of range"; + return false; + } + + const auto& input = *node.InputDefs()[idx]; int32_t input_type = ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED; @@ -102,7 +107,7 @@ bool BaseOpBuilder::HasSupportedInputsImpl(const Node& node, const OpBuilderInpu const logging::Logger& logger) const { // We only check the type of input 0 by default // specific op builder can override this - return IsInput0Supported(node, input_params, logger); + return IsInputFloat(node, 0, input_params, logger); } bool BaseOpBuilder::HasSupportedOpSet(const Node& node, const logging::Logger& logger) const { diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h index 06c4dd94ea30d..63f0b813d654c 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h +++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h @@ -28,9 +28,9 @@ class BaseOpBuilder : public IOpBuilder { void AddInitializersToSkip(ModelBuilder& /*model_builder*/, const Node& /*node*/) const override {} protected: - // check if the first input's data type is supported. 
diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h
index 06c4dd94ea30d..63f0b813d654c 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h
+++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h
@@ -28,9 +28,9 @@ class BaseOpBuilder : public IOpBuilder {
   void AddInitializersToSkip(ModelBuilder& /*model_builder*/, const Node& /*node*/) const override {}
 
  protected:
-  // check if the first input's data type is supported.
-  static bool IsInput0Supported(const Node& node, const OpBuilderInputParams& input_params,
-                                const logging::Logger& logger);
+  // check if the input's data type is supported. currently we only support float
+  static bool IsInputFloat(const Node& node, size_t idx, const OpBuilderInputParams& input_params,
+                           const logging::Logger& logger);
 
  private:
   virtual bool IsOpSupportedImpl(const Node& /*node*/, const OpBuilderInputParams& /*input_params*/,
diff --git a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
index 6074fba1433d9..fb8e07633621f 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
@@ -5,6 +5,7 @@
 #include "core/providers/common.h"
 #include "core/providers/coreml/builders/helper.h"
 #include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/impl/builder_utils.h"
 #include "core/providers/coreml/builders/model_builder.h"
 #include "core/providers/coreml/builders/op_builder_factory.h"
 #include "core/providers/shared/utils/utils.h"
@@ -19,6 +20,8 @@ class BinaryOpBuilder : public BaseOpBuilder {
 
   bool HasSupportedInputsImpl(const Node& node, const OpBuilderInputParams& input_params,
                               const logging::Logger& logger) const override;
+
+  bool SupportsMLProgram() const override { return true; }
 };
 
 namespace {
@@ -57,38 +60,72 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
   const auto& op_type(node.OpType());
   const auto& input_defs(node.InputDefs());
 
-  std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);
-
-  if (op_type == "Add") {
-    // original mutable_add() has limited broadcasting support
-    // updated to use CoreML::AddBroadcastableLayerParams which has more general broadcasting support
-    if (CheckIfBothInputShapesMatch(node, logger)) {
-      layer->mutable_add();
-    } else {
-      layer->mutable_addbroadcastable();
-    }
-  } else if (op_type == "Mul") {
-    if (CheckIfBothInputShapesMatch(node, logger)) {
-      layer->mutable_multiply();
-    } else {
-      layer->mutable_multiplybroadcastable();
-    }
-  } else if (op_type == "Sub") {
-    layer->mutable_subtractbroadcastable();
-  } else if (op_type == "Div") {
-    layer->mutable_dividebroadcastable();
-  } else if (op_type == "Pow") {
-    layer->mutable_powbroadcastable();
-  } else {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                           "BinaryOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type);
-  }
-
-  *layer->mutable_input()->Add() = input_defs[0]->Name();
-  *layer->mutable_input()->Add() = input_defs[1]->Name();
-  *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();
+#if defined(COREML_ENABLE_MLPROGRAM)
+  if (model_builder.CreateMLProgram()) {
+    using namespace CoreML::Specification::MILSpec;
+
+    // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#module-coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_binary
+    std::string_view coreml_op_type;
+    if (op_type == "Add") {
+      coreml_op_type = "add";
+    } else if (op_type == "Mul") {
+      coreml_op_type = "mul";
+    } else if (op_type == "Sub") {
+      coreml_op_type = "sub";
+    } else if (op_type == "Div") {
+      // we only support fp32 currently. when we add support for integers we need to check the type and use
+      // "floor_div" or "real_div" accordingly
+      coreml_op_type = "real_div";
+    } else if (op_type == "Pow") {
+      coreml_op_type = "pow";
+    } else {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "BinaryOpBuilder::AddToModelBuilderImpl, unexpected op: ", op_type);
+    }
+
+    std::unique_ptr<Operation> op = model_builder.CreateOperation(node, coreml_op_type);
+    AddOperationInput(*op, "x", input_defs[0]->Name());
+    AddOperationInput(*op, "y", input_defs[1]->Name());
+    AddOperationOutput(*op, *node.OutputDefs()[0]);
+
+    model_builder.AddOperation(std::move(op));
+  } else
+#endif  // defined (COREML_ENABLE_MLPROGRAM)
+  {
+    std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);
+
+    if (op_type == "Add") {
+      // original mutable_add() has limited broadcasting support
+      // updated to use CoreML::AddBroadcastableLayerParams which has more general broadcasting support
+      if (CheckIfBothInputShapesMatch(node, logger)) {
+        layer->mutable_add();
+      } else {
+        layer->mutable_addbroadcastable();
+      }
+    } else if (op_type == "Mul") {
+      if (CheckIfBothInputShapesMatch(node, logger)) {
+        layer->mutable_multiply();
+      } else {
+        layer->mutable_multiplybroadcastable();
+      }
+    } else if (op_type == "Sub") {
+      layer->mutable_subtractbroadcastable();
+    } else if (op_type == "Div") {
+      layer->mutable_dividebroadcastable();
+    } else if (op_type == "Pow") {
+      layer->mutable_powbroadcastable();
+    } else {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "BinaryOpBuilder::AddToModelBuilderImpl, unexpected op: ", op_type);
+    }
+
+    *layer->mutable_input()->Add() = input_defs[0]->Name();
+    *layer->mutable_input()->Add() = input_defs[1]->Name();
+    *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();
+
+    model_builder.AddLayer(std::move(layer));
+  }
 
-  model_builder.AddLayer(std::move(layer));
   return Status::OK();
 }
 
@@ -99,25 +136,11 @@ int BinaryOpBuilder::GetMinSupportedOpSet(const Node& /* node */) const {
 
 bool BinaryOpBuilder::HasSupportedInputsImpl(const Node& node, const OpBuilderInputParams& input_params,
                                              const logging::Logger& logger) const {
-  if (node.OpType() != "Pow") {
-    return IsInput0Supported(node, input_params, logger);
-  }
-
-  const auto& input_1 = *node.InputDefs()[0];
-  const auto& input_2 = *node.InputDefs()[1];
-
-  // Pow we only support both inputs as fp32 for now
-  int32_t input_type_1;
-  int32_t input_type_2;
-  if (!GetType(input_1, input_type_1, logger) ||
-      !GetType(input_2, input_type_2, logger)) {
-    return false;
-  }
-
-  if (input_type_1 != ONNX_NAMESPACE::TensorProto_DataType_FLOAT || input_type_1 != input_type_2) {
-    LOGS(logger, VERBOSE) << "Pow only supports fp32 inputs, actual input type"
-                          << ", Input type 1: " << input_type_1
-                          << ", Input type 2: " << input_type_2;
+  // Add/Sub/Mul/Div spec says inputs must be of the same type.
+  // Pow spec says inputs can be different types.
+  // We only support float for all of these inputs.
+  if (!IsInputFloat(node, 0, input_params, logger) ||
+      ((node.OpType() == "Pow") && !IsInputFloat(node, 1, input_params, logger))) {
     return false;
   }
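Note: the HasSupportedInputsImpl rewrite above composes per-index IsInputFloat checks instead of hand-rolling GetType comparisons. As a sketch of how another builder could reuse the same helper (the class name is hypothetical and assumes a BaseOpBuilder subclass; this is not part of the patch):

    // Sketch: a builder whose op requires both data inputs to be float.
    bool TwoInputOpBuilder::HasSupportedInputsImpl(const Node& node, const OpBuilderInputParams& input_params,
                                                   const logging::Logger& logger) const {
      return IsInputFloat(node, 0, input_params, logger) &&
             IsInputFloat(node, 1, input_params, logger);
    }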
diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
index 710f596b2a562..cbea969904ed5 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
@@ -7,6 +7,7 @@
 #include "core/framework/tensorprotoutils.h"
 #include "core/providers/coreml/builders/coreml_spec.h"
 #include "core/providers/coreml/builders/helper.h"
+#include "core/providers/coreml/builders/model_builder.h"
 #include "core/providers/shared/utils/utils.h"
 #include "core/optimizer/initializer.h"
 
@@ -132,6 +133,7 @@ void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span<const int64_t> data) {
+#if defined(COREML_ENABLE_MLPROGRAM)
+void AddPadTypeAndPads(COREML_SPEC::MILSpec::Operation& op, ModelBuilder& model_builder, std::string_view op_type,
+                       const NodeAttrHelper& helper, int num_spatial_dims) {
+  AutoPadType auto_pad_type = StringToAutoPadType(helper.Get("auto_pad", "NOTSET"));
+
+  switch (auto_pad_type) {
+    case AutoPadType::NOTSET: {
+      // use `pads` attribute.
+      auto onnx_pads = helper.GetInt64s("pads");  // 'pads' must be provided if auto_pad is NOTSET
+      if (onnx_pads) {
+        AddOperationInput(op, "pad_type",
+                          model_builder.AddScalarConstant(op_type, "pad_type", std::string("custom")));
+
+        // need to re-order from x1_start, x2_start..., x1_end, x2_end... to
+        // x1_start, x1_end, x2_start, x2_end,...
+        size_t num_pads = onnx_pads->size();
+        size_t num_dims = num_pads / 2;
+        std::vector<int64_t> reordered_pads(num_pads, 0);
+        for (size_t i = 0; i < num_pads; ++i) {
+          auto cur_dim = i % num_dims;
+          if (i < num_dims) {  // start values
+            reordered_pads[cur_dim * 2] = (*onnx_pads)[i];
+          } else {  // end values
+            reordered_pads[cur_dim * 2 + 1] = (*onnx_pads)[i];
+          }
+        }
+
+        AddOperationInput(op, "pad", model_builder.AddConstant(op_type, "pad", reordered_pads));
+
+        break;
+      }
+
+      // fall through if explicit pads were not provided as the default value for `pads` is all zeros,
+      // which is the same as 'valid' padding.
+      [[fallthrough]];
+    }
+    case AutoPadType::VALID:
+      AddOperationInput(op, "pad_type",
+                        model_builder.AddScalarConstant(op_type, "pad_type", std::string("valid")));
+
+      break;
+    case AutoPadType::SAME_UPPER:
+    case AutoPadType::SAME_LOWER: {
+      const auto pad_type = (auto_pad_type == AutoPadType::SAME_UPPER ? "same" : "same_lower");
+      AddOperationInput(op, "pad_type",
+                        model_builder.AddScalarConstant(op_type, "pad_type", std::string(pad_type)));
+
+      // despite what the spec says, a 'pad' input seems to be required.
+      // https://github.com/apple/coremltools/issues/2127
+      // Provide the default value as that's what coremltools does for conv/avg_pool/max_pool.
+      std::vector<int64_t> ignored_pads(num_spatial_dims * 2, 0);
+      AddOperationInput(op, "pad", model_builder.AddConstant(op_type, "pad", ignored_pads));
+
+      break;
+    }
+  }
+}
+#endif  // defined(COREML_ENABLE_MLPROGRAM)
 }  // namespace coreml
 }  // namespace onnxruntime
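Note: the reordering loop above converts ONNX's grouped pads layout ({x1_start, x2_start, ..., x1_end, x2_end, ...}) into CoreML's per-dimension layout ({x1_start, x1_end, x2_start, x2_end, ...}). A small self-contained sketch mirroring that loop for a 2D case (illustrative, not EP code):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      const std::vector<int64_t> onnx_pads{1, 2, 3, 4};  // top=1, left=2, bottom=3, right=4
      const size_t num_pads = onnx_pads.size();
      const size_t num_dims = num_pads / 2;

      std::vector<int64_t> reordered(num_pads, 0);
      for (size_t i = 0; i < num_pads; ++i) {
        const size_t cur_dim = i % num_dims;
        if (i < num_dims) {
          reordered[cur_dim * 2] = onnx_pads[i];      // start value
        } else {
          reordered[cur_dim * 2 + 1] = onnx_pads[i];  // end value
        }
      }

      // prints: 1 3 2 4  (i.e. {top, bottom, left, right})
      for (int64_t v : reordered) printf("%lld ", static_cast<long long>(v));
      printf("\n");
      return 0;
    }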
diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h
index 8126f0c126914..2804589065631 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h
+++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h
@@ -11,13 +11,15 @@
 #include "core/common/status.h"
 #include "core/graph/basic_types.h"
 #include "core/providers/common.h"
-
 #include "core/providers/coreml/builders/coreml_spec.h"
+#include "core/providers/shared/utils/utils.h"
 
 namespace onnxruntime {
 class NodeArg;
 
 namespace coreml {
+class ModelBuilder;
+
 // Try to see if we can map explicit padding to auto padding for Conv/Pool
 // Since usually use auto padding is more efficient
 Status HandleAutoPad(const std::vector<int64_t> input_shape,
@@ -45,6 +47,7 @@ void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span<const int32_t> data);
 void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span<const int64_t> data);
 
+#if defined(COREML_ENABLE_MLPROGRAM)
 //
 // MLProgram utils
 //
@@ -130,5 +133,17 @@ void AddOperationInput(COREML_SPEC::MILSpec::Operation& op,
 /// <param name="op">Operation to update.</param>
 /// <param name="output">NodeArg with details of output to add.</param>
 void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& output);
+
+/// <summary>
+/// Add pad_type and pad values.
+/// </summary>
+/// <param name="op">Operator to update</param>
+/// <param name="model_builder">ModelBuilder to add constants with.</param>
+/// <param name="op_type">Operator type.</param>
+/// <param name="helper">Node attribute helper.</param>
+/// <param name="num_spatial_dims">Number of spatial dims in input. Generally rank - 2 (ignore N and C dims).</param>
+void AddPadTypeAndPads(COREML_SPEC::MILSpec::Operation& op, ModelBuilder& model_builder, std::string_view op_type,
+                       const NodeAttrHelper& helper, int num_spatial_dims);
+#endif  // defined(COREML_ENABLE_MLPROGRAM)
 }  // namespace coreml
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc
index 9aca172abec98..41f4041ef1181 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/impl/builder_utils.h"
 #include "core/providers/coreml/builders/model_builder.h"
 #include "core/providers/coreml/builders/op_builder_factory.h"
 #include "core/providers/shared/utils/utils.h"
@@ -17,11 +18,31 @@ class ClipOpBuilder : public BaseOpBuilder {
 
   bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
                          const logging::Logger& logger) const override;
+
+  bool SupportsMLProgram() const override { return true; }
 };
 
 void ClipOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const {
+  bool skip = true;
+
+  if (model_builder.CreateMLProgram()) {
+    float min, max;
+    ORT_IGNORE_RETURN_VALUE(GetClipMinMax(model_builder.GetGraphViewer(), node, min, max, model_builder.Logger()));
+
+    bool has_min = min != std::numeric_limits<float>::lowest();
+    bool has_max = max != std::numeric_limits<float>::max();
+    if (has_min && has_max && min == 0.f && max == 6.f) {
+      // relu6 - skip both
+    } else if (has_min && min == 0.f && !has_max) {
+      // relu - skip both
+    } else {
+      // clip - we will use both
+      skip = false;
+    }
+  }
+
   // Both min and max values will be injected into the layer, no need to add to the model
-  if (node.SinceVersion() >= 11) {
+  if (skip && node.SinceVersion() >= 11) {
     if (node.InputDefs().size() > 1)
       model_builder.AddInitializerToSkip(node.InputDefs()[1]->Name());
 
@@ -35,72 +56,126 @@ Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
                                             const logging::Logger& logger) const {
   const auto& node_name = node.Name();
   const auto& input_name = node.InputDefs()[0]->Name();
-  const auto& output_name = node.OutputDefs()[0]->Name();
+  const auto& output = *node.OutputDefs()[0];
+  const auto& output_name = output.Name();
   float min, max;
   ORT_RETURN_IF_NOT(GetClipMinMax(model_builder.GetGraphViewer(), node, min, max, logger), "GetClipMinMax failed");
 
   bool has_min = min != std::numeric_limits<float>::lowest();
   bool has_max = max != std::numeric_limits<float>::max();
 
-  if (!has_min && !has_max) {
-    // Clip without min/max is an identity node
-    // In CoreML we don't have identity, use ActivationLinear instead
-    std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);
-    layer->mutable_activation()->mutable_linear()->set_alpha(1.0f);
-    *layer->mutable_input()->Add() = input_name;
-    *layer->mutable_output()->Add() = output_name;
-
-    model_builder.AddLayer(std::move(layer));
-  } else {
-    // The implementation of clip(min, max) is done by
-    // 1. Clipping at min -> max(input, min) is handled by
-    //    min_output = threshold(input, min)
-    // 2. Clipping at max -> min(min_output, max) is handled by
-    //    output = -1 * (threshold(-min_output, -max))
-
-    // Now we have at least one or min or max is not default value
-    // Clipping at max will need take the output of clipping at min, or the node input, if min value is default
-    // If max value is default, the output of clipping at min will be the output of the node
-    std::string min_output_name = output_name;
-    if (has_max) {
-      min_output_name = has_min
-                            ? model_builder.GetUniqueName(node_name + "min_output")
-                            : input_name;
+#if defined(COREML_ENABLE_MLPROGRAM)
+  if (model_builder.CreateMLProgram()) {
+    using namespace CoreML::Specification::MILSpec;
+
+    std::unique_ptr<Operation> op;
+    if (!has_min && !has_max) {
+      // Clip without min/max is an identity node.
+      op = model_builder.CreateOperation(node, "identity");
+      Operation& identity_op = *op;
+      AddOperationInput(identity_op, "x", input_name);
+    } else {
+      if (has_min && has_max && min == 0.f && max == 6.f) {
+        // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.activation.relu6
+        op = model_builder.CreateOperation(node, "relu6");
+        Operation& relu6_op = *op;
+        AddOperationInput(relu6_op, "x", input_name);
+      } else if (has_min && min == 0.f && !has_max) {
+        // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.activation.relu
+        op = model_builder.CreateOperation(node, "relu");
+        Operation& relu_op = *op;
+        AddOperationInput(relu_op, "x", input_name);
+      } else {
+        // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_unary.clip
+        op = model_builder.CreateOperation(node, "clip");
+
+        Operation& clip_op = *op;
+        AddOperationInput(clip_op, "x", input_name);
+
+        // if min and max were attributes we need to add initializers. otherwise we use the existing inputs
+        const bool min_max_attribs = node.SinceVersion() < 11;
+        std::string_view min_name = min_max_attribs ? model_builder.AddScalarConstant(clip_op.type(), "min", min)
+                                                    : node.InputDefs()[1]->Name();
+
+        AddOperationInput(clip_op, "alpha", min_name);
+
+        if (has_max) {
+          std::string_view max_name = min_max_attribs ? model_builder.AddScalarConstant(clip_op.type(), "max", max)
+                                                      : node.InputDefs()[2]->Name();
+          AddOperationInput(clip_op, "beta", max_name);
+        }
+      }
     }
 
-    // Handle clipping at min first
-    if (has_min) {
-      std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> min_layer = model_builder.CreateNNLayer(node, "_Clip_min");
-      if (min == 0.0f) {  // If min is 0. then this min will be handled by relu
-        min_layer->mutable_activation()->mutable_relu();
-      } else {  // otherwise, min will be handled by unary->threshold
-        min_layer->mutable_unary()->set_alpha(min);
-        min_layer->mutable_unary()->set_type(COREML_SPEC::UnaryFunctionLayerParams::THRESHOLD);
+    AddOperationOutput(*op, output);
+    model_builder.AddOperation(std::move(op));
+  } else
+#endif  // defined(COREML_ENABLE_MLPROGRAM)
+  {
+    // TODO: CoreML has a Clip layer for NeuralNetwork. Added in CoreML 4. We could potentially use that if available
+    // to simplify.
+    // https://apple.github.io/coremltools/mlmodel/Format/NeuralNetwork.html#cliplayerparams
+
+    if (!has_min && !has_max) {
+      // Clip without min/max is an identity node
+      // In CoreML we don't have identity, use ActivationLinear instead
+      std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);
+      layer->mutable_activation()->mutable_linear()->set_alpha(1.0f);
+      *layer->mutable_input()->Add() = input_name;
+      *layer->mutable_output()->Add() = output_name;
+
+      model_builder.AddLayer(std::move(layer));
+    } else {
+      // The implementation of clip(min, max) is done by
+      // 1. Clipping at min -> max(input, min) is handled by
+      //    min_output = threshold(input, min)
+      // 2. Clipping at max -> min(min_output, max) is handled by
+      //    output = -1 * (threshold(-min_output, -max))
+
+      // Now we have at least one or min or max is not default value
+      // Clipping at max will need take the output of clipping at min, or the node input, if min value is default
+      // If max value is default, the output of clipping at min will be the output of the node
+      std::string min_output_name = output_name;
+      if (has_max) {
+        min_output_name = has_min
+                              ? model_builder.GetUniqueName(node_name + "min_output")
+                              : input_name;
       }
 
-      *min_layer->mutable_input()->Add() = input_name;
-      *min_layer->mutable_output()->Add() = min_output_name;
-      model_builder.AddLayer(std::move(min_layer));
-    }
-
-    // Clipping at max is handled by -1 * (threshold (-min_output, -max))
-    if (has_max) {
-      const auto threshold_output_name = model_builder.GetUniqueName(MakeString(node_name, "threshold_output"));
-      {  // Add threshold layer, which is actually max( -1 * min_output, -max)
-        auto threshold_layer = model_builder.CreateNNLayer(node, "_Clip_max_threshold");
-        threshold_layer->mutable_unary()->set_alpha(-max);
-        threshold_layer->mutable_unary()->set_scale(-1.0f);
-        threshold_layer->mutable_unary()->set_type(COREML_SPEC::UnaryFunctionLayerParams::THRESHOLD);
-        *threshold_layer->mutable_input()->Add() = min_output_name;
-        *threshold_layer->mutable_output()->Add() = threshold_output_name;
-        model_builder.AddLayer(std::move(threshold_layer));
+      // Handle clipping at min first
+      if (has_min) {
+        std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> min_layer = model_builder.CreateNNLayer(node, "_Clip_min");
+        if (min == 0.0f) {  // If min is 0. then this min will be handled by relu
+          min_layer->mutable_activation()->mutable_relu();
+        } else {  // otherwise, min will be handled by unary->threshold
+          min_layer->mutable_unary()->set_alpha(min);
+          min_layer->mutable_unary()->set_type(COREML_SPEC::UnaryFunctionLayerParams::THRESHOLD);
+        }
+
+        *min_layer->mutable_input()->Add() = input_name;
+        *min_layer->mutable_output()->Add() = min_output_name;
+        model_builder.AddLayer(std::move(min_layer));
       }
-      {  // Add linear activation layer -1 * threshold_output
-        auto linear_layer = model_builder.CreateNNLayer(node, "_Clip_max_linear");
-        linear_layer->mutable_activation()->mutable_linear()->set_alpha(-1.0f);
-        *linear_layer->mutable_input()->Add() = threshold_output_name;
-        *linear_layer->mutable_output()->Add() = output_name;
-        model_builder.AddLayer(std::move(linear_layer));
+
+      // Clipping at max is handled by -1 * (threshold (-min_output, -max))
+      if (has_max) {
+        const auto threshold_output_name = model_builder.GetUniqueName(MakeString(node_name, "threshold_output"));
+        {  // Add threshold layer, which is actually max( -1 * min_output, -max)
+          auto threshold_layer = model_builder.CreateNNLayer(node, "_Clip_max_threshold");
+          threshold_layer->mutable_unary()->set_alpha(-max);
+          threshold_layer->mutable_unary()->set_scale(-1.0f);
+          threshold_layer->mutable_unary()->set_type(COREML_SPEC::UnaryFunctionLayerParams::THRESHOLD);
+          *threshold_layer->mutable_input()->Add() = min_output_name;
+          *threshold_layer->mutable_output()->Add() = threshold_output_name;
+          model_builder.AddLayer(std::move(threshold_layer));
+        }
+        {  // Add linear activation layer -1 * threshold_output
+          auto linear_layer = model_builder.CreateNNLayer(node, "_Clip_max_linear");
+          linear_layer->mutable_activation()->mutable_linear()->set_alpha(-1.0f);
+          *linear_layer->mutable_input()->Add() = threshold_output_name;
+          *linear_layer->mutable_output()->Add() = output_name;
+          model_builder.AddLayer(std::move(linear_layer));
+        }
       }
     }
   }
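Note: the ML Program path above maps ONNX Clip onto the cheapest CoreML op that matches the bounds. A compact sketch of that selection logic, extracted from the branches in the hunk (function name illustrative):

    #include <limits>
    #include <string_view>

    // Pick the CoreML op implementing Clip(x, min, max) for the given bounds,
    // mirroring the decision structure in ClipOpBuilder::AddToModelBuilderImpl.
    std::string_view SelectCoremlClipOp(float min, float max) {
      const bool has_min = min != std::numeric_limits<float>::lowest();
      const bool has_max = max != std::numeric_limits<float>::max();

      if (!has_min && !has_max) return "identity";  // no clipping at all
      if (has_min && has_max && min == 0.f && max == 6.f) return "relu6";
      if (has_min && min == 0.f && !has_max) return "relu";
      return "clip";  // general case: alpha=min, beta=max
    }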
diff --git a/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc
index 05e43dbbd16af..38125957bf481 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc
@@ -67,99 +67,25 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
       AddOperationInput(*conv_op, "bias", input_defs[2]->Name());
     }
 
-    // ONNX attributes. Add as inputs if specified/required
-    auto strides = helper.GetInt64s("strides");
-    auto dilations = helper.GetInt64s("dilations");
-    auto groups = helper.GetInt64("group");
-
     // we know this input has a valid shape due to the check in IsOpSupportedImpl. ignore N and C dims.
     const auto num_spatial_dims = input_defs[1]->Shape()->dim_size() - 2;
     const auto& op_type = conv_op->type();
 
-    if (strides) {
-      AddOperationInput(*conv_op, "strides", model_builder.AddConstant(op_type, "strides", *strides));
-    } else {
-      // spec says optional. testing suggests otherwise for at least the iOS15 target (CoreML5)
-      static const auto default_value = std::vector<int64_t>(num_spatial_dims, 1);
-      AddOperationInput(*conv_op, "strides", model_builder.AddConstant(op_type, "strides", default_value));
-    }
-
-    if (dilations) {
-      AddOperationInput(*conv_op, "dilations", model_builder.AddConstant(op_type, "dilations", *dilations));
-    } else {
-      // spec says optional. testing suggests otherwise for at least the iOS15 target (CoreML5)
-      static const auto default_value = std::vector<int64_t>(num_spatial_dims, 1);
-      AddOperationInput(*conv_op, "dilations", model_builder.AddConstant(op_type, "dilations", default_value));
-    }
+    // Spec says strides and dilations are optional, but reality is they're required for at least the iOS15 target
+    // (CoreML5).
+    const auto strides = helper.Get("strides", std::vector<int64_t>(num_spatial_dims, 1));
+    auto dilations = helper.Get("dilations", std::vector<int64_t>(num_spatial_dims, 1));
+    auto groups = helper.GetInt64("group");
+
+    AddOperationInput(*conv_op, "strides", model_builder.AddConstant(op_type, "strides", strides));
+    AddOperationInput(*conv_op, "dilations", model_builder.AddConstant(op_type, "dilations", dilations));
 
     if (groups) {
       AddOperationInput(*conv_op, "groups", model_builder.AddScalarConstant(op_type, "groups", *groups));
     }
 
-    AutoPadType auto_pad_type = StringToAutoPadType(helper.Get("auto_pad", "NOTSET"));
-
-    // pad type (string)
-    //   valid - no pads  (ONNX auto_pad VALID)
-    //   custom - pads input  (ONNX NOTSET)
-    //   same - inferred to be `d_out[i] = ceil(d_in[i] / strides[i])`  (assuming == ONNX SAME_UPPER)
-    //   same_lower - as per same but any extra rows/cols are added at top/left if padding is odd (ONNX SAME_LOWER)
-    //
-    // TODO: See if we want to update HandleAutoPad to support 1D (and 3D) so we can infer if an autopad value
-    //       can be used. TBD if that provides any performance benefit with ML Program though as CoreML could
-    //       potentially do that for us.
-    switch (auto_pad_type) {
-      case AutoPadType::NOTSET: {
-        // use `pads` attribute.
-        auto onnx_pads = helper.GetInt64s("pads");  // 'pads' must be provided if auto_pad is NOTSET
-        if (onnx_pads) {
-          AddOperationInput(*conv_op, "pad_type",
-                            model_builder.AddScalarConstant(op_type, "pad_type", std::string("custom")));
-
-          // need to re-order from x1_start, x2_start..., x1_end, x2_end... to
-          // x1_start, x1_end, x2_start, x2_end,...
-          size_t num_pads = onnx_pads->size();
-          size_t num_dims = num_pads / 2;
-          std::vector<int64_t> reordered_pads(num_pads, 0);
-          for (size_t i = 0; i < num_pads; ++i) {
-            auto cur_dim = i % num_dims;
-            if (i < num_dims) {  // start values
-              reordered_pads[cur_dim * 2] = (*onnx_pads)[i];
-            } else {  // end values
-              reordered_pads[cur_dim * 2 + 1] = (*onnx_pads)[i];
-            }
-          }
-
-          AddOperationInput(*conv_op, "pad", model_builder.AddConstant(op_type, "pad", reordered_pads));
-
-          break;
-        }
-
-        // in theory the pads may not be provided and in that case the default is no padding.
-        // as that is the same as 'valid', fall through
-        [[fallthrough]];
-      }
-      case AutoPadType::VALID:
-        AddOperationInput(*conv_op, "pad_type",
-                          model_builder.AddScalarConstant(op_type, "pad_type", std::string("valid")));
-
-        break;
-      case AutoPadType::SAME_UPPER:
-      case AutoPadType::SAME_LOWER: {
-        const auto pad_type = (auto_pad_type == AutoPadType::SAME_UPPER ? "same" : "same_lower");
-        AddOperationInput(*conv_op, "pad_type",
-                          model_builder.AddScalarConstant(op_type, "pad_type", std::string(pad_type)));
-
-        // despite what the spec says, a 'pad' input seems to be required.
-        // https://github.com/apple/coremltools/issues/2127
-        // provide the default value. passing in an empty vector also works. TBD what's better.
-        std::vector<int64_t> ignored_pads(num_spatial_dims * 2, 0);
-        AddOperationInput(*conv_op, "pad", model_builder.AddConstant(op_type, "pad", ignored_pads));
-
-        break;
-      }
-    }
+    AddPadTypeAndPads(*conv_op, model_builder, op_type, helper, num_spatial_dims);
 
-    // set output
     AddOperationOutput(*conv_op, *node.OutputDefs()[0]);
 
     model_builder.AddOperation(std::move(conv_op));
@@ -297,7 +223,7 @@ bool ConvOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
   const auto& input_defs = node.InputDefs();
 
   const auto& weight_name = input_defs[1]->Name();
-  const auto* weight = input_params.graph_viewer.GetConstantInitializer(weight_name, true);
+  const auto* weight = input_params.graph_viewer.GetConstantInitializer(weight_name);
 
 #if defined(COREML_ENABLE_MLPROGRAM)
   if (input_params.create_mlprogram) {
@@ -324,7 +250,7 @@ bool ConvOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
       return false;
     }
 
-    if (input_defs.size() > 2 && !input_params.graph_viewer.GetConstantInitializer(input_defs[2]->Name(), true)) {
+    if (input_defs.size() > 2 && !input_params.graph_viewer.GetConstantInitializer(input_defs[2]->Name())) {
      LOGS(logger, VERBOSE) << "The bias of Conv [" << name << "] must be a constant initializer";
       return false;
     }
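Note: for the 'same'/'same_lower' pad types removed above (now centralized in AddPadTypeAndPads), the output size is inferred as d_out[i] = ceil(d_in[i] / strides[i]). A small worked example of that padding arithmetic, following the ONNX SAME_UPPER/SAME_LOWER definitions (illustrative, not code from the patch):

    #include <cstdint>
    #include <cstdio>

    // Example: input 5, kernel 3, stride 2.
    // out = ceil(5 / 2) = 3; total_pad = (out - 1) * stride + kernel - in = 4 + 3 - 5 = 2.
    // If total_pad is odd, SAME_UPPER puts the extra pad at the end, SAME_LOWER at the start.
    int main() {
      const int64_t in = 5, kernel = 3, stride = 2;
      const int64_t out = (in + stride - 1) / stride;              // ceil(in / stride)
      const int64_t total_pad = (out - 1) * stride + kernel - in;  // may be negative; clamp at 0
      const int64_t pad = total_pad > 0 ? total_pad : 0;

      const int64_t pad_start = pad / 2;        // SAME_UPPER: start gets the smaller half
      const int64_t pad_end = pad - pad_start;
      printf("out=%lld pad_start=%lld pad_end=%lld\n",
             (long long)out, (long long)pad_start, (long long)pad_end);
      return 0;
    }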
diff --git a/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc
index 48f77354d7c30..8daf64dc4a457 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc
@@ -22,18 +22,51 @@ class GemmOpBuilder : public BaseOpBuilder {
   Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
                                const logging::Logger& logger) const override;
 
-  bool IsOpSupportedImpl(const Node& /* node */, const OpBuilderInputParams& /* input_params */,
-                         const logging::Logger& /* logger */) const override;
+  bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
+                         const logging::Logger& logger) const override;
+
+  bool SupportsMLProgram() const override { return true; }
 };
 
 void GemmOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const {
   const auto& op = node.OpType();
   const auto& input_defs(node.InputDefs());
 
-  // We have already embedded the weights (matrix B and C(if any)) into the coreml layer
-  // No need to copy them later to reduce memory consumption
-  model_builder.AddInitializerToSkip(input_defs[1]->Name());
-  if (op == "Gemm" && input_defs.size() > 2) {
-    model_builder.AddInitializerToSkip(input_defs[2]->Name());
+  const bool is_gemm = op == "Gemm";
+
+#if defined(COREML_ENABLE_MLPROGRAM)
+  if (model_builder.CreateMLProgram()) {
+    // we have to transpose the weight input of Gemm if transB is false, and potentially override the bias shape
+    if (is_gemm) {
+      NodeAttrHelper helper(node);
+      const auto transB = helper.Get("transB", 0);
+      if (transB == 0) {
+        model_builder.AddInitializerToSkip(input_defs[1]->Name());
+      }
+
+      if (input_defs.size() > 2) {
+        // ONNX spec requires B to be 2D and we required it to be a constant initializer so reading N this way is safe
+        // B is {K, N} by default, or {N, K} if transB is true
+        int N_dim = transB ? 0 : 1;
+        int64_t N = input_defs[1]->Shape()->dim().at(N_dim).dim_value();
+
+        const auto& bias_name = input_defs[2]->Name();
+        const auto& bias = *model_builder.GetConstantInitializer(bias_name);
+        if (bias.dims_size() != 1 || bias.dims(0) != N) {
+          // we have to override the shape/duplicate data to convert {}, {1} or {1, N} to 1D {N}
+          // when adding the Gemm operation so skip adding the original initializer
+          model_builder.AddInitializerToSkip(bias_name);
+        }
+      }
+    }
+  } else
+#endif  // defined(COREML_ENABLE_MLPROGRAM)
+  {
+    // We have already embedded the weights (matrix B and C(if any)) into the coreml layer
+    // No need to copy them later to reduce memory consumption
+    model_builder.AddInitializerToSkip(input_defs[1]->Name());
+    if (is_gemm && input_defs.size() > 2) {
+      model_builder.AddInitializerToSkip(input_defs[2]->Name());
+    }
   }
 }
 
@@ -57,54 +90,152 @@ static Status GetTensorFloatDataTransposed(const ONNX_NAMESPACE::TensorProto& tensor,
 }
 
 Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
-                                            const logging::Logger& /* logger */) const {
+                                            const logging::Logger& logger) const {
   std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);
 
   const auto& op_type = node.OpType();
   const auto& input_defs = node.InputDefs();
-  const auto& b_tensor = *model_builder.GetInitializerTensors().at(input_defs[1]->Name());
-  const auto& b_shape = b_tensor.dims();
-
-  auto* coreml_inner_product = layer->mutable_innerproduct();
-
-  // The coreml innerproduct weight (matrix B) is stored transposed
-  // - for MatMul and Gemm (transB = 0), the coreml weight is B'
-  // - for Gemm (transB = 1), the coreml weight is B
-  if (op_type == "MatMul") {
-    coreml_inner_product->set_inputchannels(b_shape[0]);
-    coreml_inner_product->set_outputchannels(b_shape[1]);
-    // Add weight (b of MatMul)
-    std::vector<float> b_transposed;
-    ORT_RETURN_IF_ERROR(GetTensorFloatDataTransposed(b_tensor, b_transposed));
-    CreateCoreMLWeight(*coreml_inner_product->mutable_weights(), b_transposed);
-  } else {  // Gemm
-    NodeAttrHelper helper(node);
-    const auto transB = helper.Get("transB", 0);
-    if (transB == 0) {
-      coreml_inner_product->set_inputchannels(b_shape[0]);
-      coreml_inner_product->set_outputchannels(b_shape[1]);
+  const auto& a = *input_defs[0];
+  const auto& b = *input_defs[1];
+  const auto* b_initializer = model_builder.GetConstantInitializer(b.Name());  // MLProgram MatMul may not be constant
+
+  const bool is_matmul = op_type == "MatMul";
+  const bool is_gemm = op_type == "Gemm";
+
+  NodeAttrHelper helper(node);
+  const auto transB = is_gemm ? helper.Get("transB", 0) : 0;
+
+  std::vector<int64_t> b_shape;
+  ORT_IGNORE_RETURN_VALUE(GetShape(b, b_shape, logger));
+  int64_t b0 = -1, b1 = -1;
+
+  // ML Program MatMul supports N-D input
+  if (model_builder.CreateMLProgram() && is_matmul) {
+    if (b_shape.size() == 1) {
+      // B is treated as {b_shape[0], 1} according to the numpy rules.
+      b0 = b_shape[0];
+      b1 = 1;
+    } else {
+      // last 2 dims are used
+      b0 = b_shape[b_shape.size() - 2];
+      b1 = b_shape[b_shape.size() - 1];
+    }
+  } else {
+    // we only support 2D input
+    b0 = b_shape[0];
+    b1 = b_shape[1];
+  }
+
+  // B is {K, N} in ONNX spec by default, or {N, K} in Gemm if transB is true
+  const auto K = transB ? b1 : b0;
+  const auto N = transB ? b0 : b1;
+
+#if defined(COREML_ENABLE_MLPROGRAM)
+  if (model_builder.CreateMLProgram()) {
+    using namespace CoreML::Specification::MILSpec;
+
+    if (is_gemm) {
+      // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.linear.linear
+      auto gemm_op = model_builder.CreateOperation(node, "linear");
+      AddOperationInput(*gemm_op, "x", a.Name());
+
+      // CoreML takes weight input as {N, K} which is the reverse of ONNX.
+      // if transB is true the input weight is {N, K} so can be added directly.
+      if (transB) {
+        AddOperationInput(*gemm_op, "weight", b.Name());
+      } else {
+        // transpose from {K, N} to {N, K}
+        std::vector<float> weight_nk;
+        std::vector<int64_t> weight_nk_shape = {N, K};
+        ORT_RETURN_IF_ERROR(GetTensorFloatDataTransposed(*b_initializer, weight_nk));
+
+        AddOperationInput(*gemm_op, "weight",
+                          model_builder.AddConstant(gemm_op->type(), b.Name() + "_t", weight_nk, weight_nk_shape));
+      }
+
+      if (input_defs.size() == 3) {
+        const auto& bias_arg = *input_defs[2];
+        const auto& bias = *model_builder.GetConstantInitializer(bias_arg.Name());
+
+        // CoreML linear op requires bias to be 1D tensor of size N
+        if (bias.dims_size() == 1 && bias.dims().at(0) == N) {
+          // can use existing initializer
+          AddOperationInput(*gemm_op, "bias", bias_arg.Name());
+        } else {
+          Initializer unpacked_tensor(bias);
+          auto bias_data = unpacked_tensor.DataAsSpan<float>();
+          std::string_view bias_data_name;
+          if (bias_data.size() == 1) {
+            // expand scalar to N
+            std::vector<float> expanded_bias_data(N, bias_data[0]);
+            bias_data_name = model_builder.AddConstant(gemm_op->type(), "bias", expanded_bias_data);
+          } else {
+            // can use data as-is but need to adjust shape (inferred by AddConstant as {bias_data.size()})
+            bias_data_name = model_builder.AddConstant(gemm_op->type(), "bias", bias_data);
+          }
+
+          AddOperationInput(*gemm_op, "bias", bias_data_name);
+        }
+      }
+
+      AddOperationOutput(*gemm_op, *node.OutputDefs()[0]);
+      model_builder.AddOperation(std::move(gemm_op));
+    } else {
+      // CoreML implementation is the same as ONNX MatMul.
+      // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.linear.matmul
+      auto matmul_op = model_builder.CreateOperation(node, "matmul");
+      AddOperationInput(*matmul_op, "x", a.Name());
+      AddOperationInput(*matmul_op, "y", b.Name());
+
+      // once again the spec lies and says transpose_y and transpose_x are optional...
+      auto false_value_name = model_builder.AddScalarConstant(matmul_op->type(), "false", false);
+      AddOperationInput(*matmul_op, "transpose_x", false_value_name);
+      AddOperationInput(*matmul_op, "transpose_y", false_value_name);
+
+      AddOperationOutput(*matmul_op, *node.OutputDefs()[0]);
+      model_builder.AddOperation(std::move(matmul_op));
+    }
+  } else
+#endif  // defined(COREML_ENABLE_MLPROGRAM)
+  {
+    auto* coreml_inner_product = layer->mutable_innerproduct();
+
+    *layer->mutable_input()->Add() = a.Name();
+
+    coreml_inner_product->set_inputchannels(K);
+    coreml_inner_product->set_outputchannels(N);
+
+    // CoreML takes weight input as {N, K} which is the reverse of ONNX.
+    // if Gemm's transB is true the input weight is {N, K} and can be added directly.
+    if (transB) {
+      ORT_RETURN_IF_ERROR(CreateCoreMLWeight(*coreml_inner_product->mutable_weights(), *b_initializer));
+    } else {
       std::vector<float> b_transposed;
-      ORT_RETURN_IF_ERROR(GetTensorFloatDataTransposed(b_tensor, b_transposed));
+      ORT_RETURN_IF_ERROR(GetTensorFloatDataTransposed(*b_initializer, b_transposed));
       CreateCoreMLWeight(*coreml_inner_product->mutable_weights(), b_transposed);
-    } else {
-      coreml_inner_product->set_inputchannels(b_shape[1]);
-      coreml_inner_product->set_outputchannels(b_shape[0]);
-      // Add weight (b of MatMul)
-      ORT_RETURN_IF_ERROR(CreateCoreMLWeight(*coreml_inner_product->mutable_weights(), b_tensor));
     }
 
-    // Add bias if present
-    if (input_defs.size() > 2) {
+    if (is_gemm && input_defs.size() > 2) {
+      // Add bias
       coreml_inner_product->set_hasbias(true);
-      const auto& bias_tensor = *model_builder.GetInitializerTensors().at(input_defs[2]->Name());
-      ORT_RETURN_IF_ERROR(CreateCoreMLWeight(*coreml_inner_product->mutable_bias(), bias_tensor));
+      const auto& bias_tensor = *model_builder.GetConstantInitializer(input_defs[2]->Name());
+
+      // if scalar, or single value, expand to 1D tensor of size N
+      // IsOpSupportedImpl enforces it's scalar, {1}, {N}, or {1, N}.
+      Initializer unpacked_tensor(bias_tensor);
+      auto bias_data = unpacked_tensor.DataAsSpan<float>();
+      if (bias_data.size() == 1 && N > 1) {
+        std::vector<float> expanded_bias_data(N, bias_data[0]);
+        CreateCoreMLWeight(*coreml_inner_product->mutable_bias(), expanded_bias_data);
+      } else {
+        CreateCoreMLWeight(*coreml_inner_product->mutable_bias(), bias_data);
+      }
     }
-  }
 
-  *layer->mutable_input()->Add() = input_defs[0]->Name();
-  *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();
+    *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();
+
+    model_builder.AddLayer(std::move(layer));
+  }
 
-  model_builder.AddLayer(std::move(layer));
   return Status::OK();
 }
 
@@ -112,98 +243,105 @@ bool GemmOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params,
                                       const logging::Logger& logger) const {
   const auto& op_type = node.OpType();
   const auto& input_defs(node.InputDefs());
+  const bool is_matmul = op_type == "MatMul";
+  const bool is_gemm = op_type == "Gemm";
+
   size_t a_idx = 0, b_idx = 1, c_idx = 2;  // A*B+C
 
-  const auto& initializers = input_params.graph_viewer.GetAllInitializedTensors();
-  if (!Contains(initializers, input_defs[b_idx]->Name())) {
-    LOGS(logger, VERBOSE) << "B of Gemm/Matmul must be an initializer tensor";
+  std::vector<int64_t> a_shape;
+  if (!GetShape(*input_defs[a_idx], a_shape, logger)) {
    return false;
  }
 
-  std::vector<int64_t> a_shape;
-  {
-    if (!GetShape(*input_defs[a_idx], a_shape, logger))
-      return false;
-
-    if (a_shape.size() != 2) {
-      LOGS(logger, VERBOSE) << "A must be 2D";
-      return false;
-    }
+  std::vector<int64_t> b_shape;
+  if (!GetShape(*input_defs[b_idx], b_shape, logger)) {
+    return false;
+  }
 
-    // TODO is it ok if the shape is dynamic and empty?
-    if (Product(a_shape) == 0) {
-      LOGS(logger, VERBOSE) << "A must be non-empty";
+  if (!input_params.graph_viewer.GetConstantInitializer(input_defs[b_idx]->Name())) {
+    if (input_params.create_mlprogram && is_matmul) {
+      // ML Program MatMul allows non-constant B input
+    } else {
+      LOGS(logger, VERBOSE) << op_type << " B input must be a constant initializer";
       return false;
     }
   }
 
-  std::vector<int64_t> b_shape;
-  {
-    if (!GetShape(*input_defs[b_idx], b_shape, logger))
-      return false;
-
-    if (b_shape.size() != 2) {
-      LOGS(logger, VERBOSE) << "B must be 2D";
-      return false;
-    }
+  if (is_matmul) {
+    if (input_params.create_mlprogram) {
+      // ML Program matmul op has numpy semantics the same as the ONNX spec so we can use directly
+    } else {
+      // we could potentially support 1D and 3D if required. beyond 3D the dims that merge diverge.
+      // https://github.com/apple/coremltools/blob/1931758aae383c83daddfc56f11a24a9d2bf4b87/coremltools/converters/onnx/_operators.py#L1607
+      // https://github.com/apple/coremltools/blob/1931758aae383c83daddfc56f11a24a9d2bf4b87/coremltools/converters/mil/backend/nn/op_mapping.py#L1374
+      // https://apple.github.io/coremltools/mlmodel/Format/NeuralNetwork.html#innerproductlayerparams
+      if (a_shape.size() != 2 || b_shape.size() != 2) {
+        LOGS(logger, VERBOSE) << "a and b inputs must be 2D.";
+        return false;
+      }
 
-    if (Product(b_shape) == 0) {
-      LOGS(logger, VERBOSE) << "B must be non-empty";
-      return false;
+      if (input_defs.size() > 2) {
+        LOGS(logger, VERBOSE) << "MatMul with C input is not supported";
+        return false;
+      }
     }
   }
 
-  if (op_type == "Gemm") {
+  if (is_gemm) {
+    // A and B are 2D due to the ONNX spec
     NodeAttrHelper helper(node);
     const auto transA = helper.Get("transA", 0);
     const auto transB = helper.Get("transB", 0);
     const auto alpha = helper.Get("alpha", 1.0f);
     const auto beta = helper.Get("beta", 1.0f);
+
+    // TODO: We can support transA, alpha and beta by using multiple layers/operations if needed.
     if (!(transA == 0 && alpha == 1.f && beta == 1.f)) {
-      LOGS(logger, VERBOSE) << "Only transA == 0, alpha == 1.0 "
-                            << "and beta == 1.0 is supported."
+      LOGS(logger, VERBOSE) << "Only support for transA == 0, alpha == 1.0 "
+                            << "and beta == 1.0 is currently implemented."
                             << " transA " << transA
                            << " alpha " << alpha
                            << " beta " << beta;
       return false;
    }
 
-    // C of Gemm
-    // For now we only support {n} or {1,n} tensor
     if (input_defs.size() == 3) {
-      if (!Contains(initializers, input_defs[c_idx]->Name())) {
-        LOGS(logger, VERBOSE) << "C of Gemm must be an initializer tensor";
+      if (!input_params.graph_viewer.GetConstantInitializer(input_defs[c_idx]->Name())) {
+        LOGS(logger, VERBOSE) << "C of Gemm must be a constant initializer";
        return false;
      }
 
       std::vector<int64_t> c_shape;
-      if (!GetShape(*input_defs[c_idx], c_shape, logger))
+      if (!GetShape(*input_defs[c_idx], c_shape, logger)) {
         return false;
+      }
 
-      size_t c_dim = c_shape.size();
+      // B is {K, N} in ONNX spec by default, or {N, K} in Gemm if transB is true
+      const auto N = transB ? b_shape[0] : b_shape[1];
 
-      if (c_dim == 0) {
-        LOGS(logger, VERBOSE) << "C of Gemm cannot be a scalar";
-        return false;
-      }
+      size_t c_rank = c_shape.size();
 
-      if (c_dim != 1) {
-        // If C is a (2+)d tensor, it must have the format {1, 1, ..., 1, n}
-        // where every except the last dimension should be 1
-        for (size_t i = 0; i < c_dim - 1; ++i) {
-          if (c_shape[i] != 1) {
-            LOGS(logger, VERBOSE) << "C of Gemm must be a vector or a tensor with only last dimension != 1";
-            return false;
+      // allowed: scalar, or 1D where the value is 1 or N, 2D with shape {1, N}
+      bool c_valid = false;
+      switch (c_rank) {
+        case 0:
+          c_valid = true;
+          break;
+        case 1:
+          if (c_shape[0] == 1 || c_shape[0] == N) {
+            c_valid = true;
          }
-        }
+          break;
+        case 2:
+          if (c_shape[0] == 1 && c_shape[1] == N) {
+            c_valid = true;
+          }
+          break;
      }
 
-      auto c_size = c_shape[c_dim - 1];
-      if (c_size != (transB == 0 ? b_shape[1] : b_shape[0])) {
-        LOGS(logger, VERBOSE) << "C of Gemm must be a vector of b_shape["
-                              << (transB == 0 ? "1" : "0") << "]"
-                              << " b_shape: [" << b_shape[0] << ", " << b_shape[1] << "]"
-                              << " c_size: " << c_size;
+      if (!c_valid) {
+        LOGS(logger, VERBOSE) << "Shape of C Gemm input must be {}, {1}, {N}, or {1, N}. N:" << N << " C shape:"
+                              << Shape2String(c_shape);
+
        return false;
      }
N:" << N << " C shape:" + << Shape2String(c_shape); return false; } diff --git a/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc index 01aced739b36d..17910ba6fd486 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc @@ -19,104 +19,176 @@ class PoolOpBuilder : public BaseOpBuilder { bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } }; Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { - std::unique_ptr layer = model_builder.CreateNNLayer(node); - - auto* coreml_pool = layer->mutable_pooling(); const auto& op_type = node.OpType(); const auto& input_defs = node.InputDefs(); - bool is_global_pooling = false; - if (op_type == "GlobalAveragePool") { - is_global_pooling = true; - coreml_pool->set_type(COREML_SPEC::PoolingLayerParams_PoolingType_AVERAGE); - } else if (op_type == "GlobalMaxPool") { - is_global_pooling = true; - coreml_pool->set_type(COREML_SPEC::PoolingLayerParams_PoolingType_MAX); - } else if (op_type == "AveragePool") { - coreml_pool->set_type(COREML_SPEC::PoolingLayerParams_PoolingType_AVERAGE); - } else if (op_type == "MaxPool") { - coreml_pool->set_type(COREML_SPEC::PoolingLayerParams_PoolingType_MAX); - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "PoolOpBuilder, unknown op: ", op_type); - } +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; + + std::string_view coreml_op_type; + bool is_global = false; + bool is_avg_pool = false; + if (op_type == "GlobalAveragePool") { + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.reduction.reduce_mean + coreml_op_type = "reduce_mean"; + is_global = true; + } else if (op_type == "GlobalMaxPool") { + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.reduction.reduce_max + coreml_op_type = "reduce_max"; + is_global = true; + } else if (op_type == "AveragePool") { + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.pool.avg_pool + coreml_op_type = "avg_pool"; + is_avg_pool = true; + } else if (op_type == "MaxPool") { + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.pool.max_pool + coreml_op_type = "max_pool"; + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "PoolOpBuilder, unexpected op: ", op_type); + } - if (is_global_pooling) { - coreml_pool->set_globalpooling(true); - coreml_pool->mutable_valid(); - } else { // AveragePool or MaxPool - NodeAttrHelper helper(node); - const auto kernel_shape = helper.Get("kernel_shape", std::vector{0, 0}); - const auto strides = helper.Get("strides", std::vector{1, 1}); - const auto onnx_pads = helper.Get("pads", std::vector{0, 0, 0, 0}); - - coreml_pool->add_kernelsize(kernel_shape[0]); - coreml_pool->add_kernelsize(kernel_shape[1]); - coreml_pool->add_stride(strides[0]); - coreml_pool->add_stride(strides[1]); - 
coreml_pool->set_avgpoolexcludepadding(helper.Get("count_include_pad", 0) == 0); - coreml_pool->set_globalpooling(false); - - // Add Padding - // Usually using autopadding is more efficient than using explicit padding - // Try to see if we can map explicit padding to auto padding - std::vector input_shape; - ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Cannot get shape"); - AutoPadType auto_pad_type; - ORT_RETURN_IF_ERROR(HandleAutoPad(input_shape, kernel_shape[0], kernel_shape[1], - onnx_pads, strides, {1, 1} /* dilations */, - StringToAutoPadType(helper.Get("auto_pad", "NOTSET")), - auto_pad_type)); - - if (AutoPadType::SAME_UPPER == auto_pad_type || AutoPadType::SAME_LOWER == auto_pad_type) { - auto* padding_type = coreml_pool->mutable_same(); - if (AutoPadType::SAME_LOWER == auto_pad_type) { // default is SAME_UPPER - padding_type->set_asymmetrymode(COREML_SPEC::SamePadding_SamePaddingMode_TOP_LEFT_HEAVY); + std::unique_ptr op = model_builder.CreateOperation(node, coreml_op_type); + + AddOperationInput(*op, "x", input_defs[0]->Name()); + + if (is_global) { + // keep N and C dims, reduce the rest with keepdims=True. equivalent to the ONNX Global*Pool ops. + std::vector axes{2, 3}; // we only support 4D input currently. + AddOperationInput(*op, "axes", model_builder.AddConstant(op->type(), "axes", axes)); + AddOperationInput(*op, "keep_dims", model_builder.AddScalarConstant(op->type(), "keep_dims", true)); + } else { + NodeAttrHelper helper(node); + constexpr int num_spatial_dims = 2; // we only support 4D. -2 for N and C dims. + + AddPadTypeAndPads(*op, model_builder, op->type(), helper, num_spatial_dims); + + const auto kernel_shape = helper.GetInt64s("kernel_shape"); // required + AddOperationInput(*op, "kernel_sizes", model_builder.AddConstant(op->type(), "kernel_sizes", *kernel_shape)); + + // in theory all these values are optional according to the CoreML spec but simpler to just provide default + // values as the actual model compilation tends to require them. 
+ const auto strides = helper.Get("strides", std::vector(num_spatial_dims, 1)); + const bool ceil_mode = helper.Get("ceil_mode", int64_t(0)); // convert int64_t to bool + + AddOperationInput(*op, "strides", model_builder.AddConstant(op->type(), "strides", strides)); + AddOperationInput(*op, "ceil_mode", model_builder.AddScalarConstant(op->type(), "ceil_mode", ceil_mode)); + + if (is_avg_pool) { + const bool count_exclude_pad = helper.Get("count_include_pad", int64_t(0)) == 0; + AddOperationInput(*op, "exclude_padding_from_average", + model_builder.AddScalarConstant(op->type(), "count_exclude_pad", count_exclude_pad)); } + } + + AddOperationOutput(*op, *node.OutputDefs()[0]); + model_builder.AddOperation(std::move(op)); + + } else +#endif // defined(COREML_ENABLE_MLPROGRAM) + { + std::unique_ptr layer = model_builder.CreateNNLayer(node); + + auto* coreml_pool = layer->mutable_pooling(); + + bool is_global_pooling = false; + if (op_type == "GlobalAveragePool") { + is_global_pooling = true; + coreml_pool->set_type(COREML_SPEC::PoolingLayerParams_PoolingType_AVERAGE); + } else if (op_type == "GlobalMaxPool") { + is_global_pooling = true; + coreml_pool->set_type(COREML_SPEC::PoolingLayerParams_PoolingType_MAX); + } else if (op_type == "AveragePool") { + coreml_pool->set_type(COREML_SPEC::PoolingLayerParams_PoolingType_AVERAGE); + } else if (op_type == "MaxPool") { + coreml_pool->set_type(COREML_SPEC::PoolingLayerParams_PoolingType_MAX); } else { - auto* padding_type = coreml_pool->mutable_valid(); - if (AutoPadType::NOTSET == auto_pad_type && onnx_pads != std::vector{0, 0, 0, 0}) { - // NOTSET is adding the explicit padding to the ValidPadding.paddingAmounts - auto* height_border = padding_type->mutable_paddingamounts()->add_borderamounts(); - height_border->set_startedgesize(onnx_pads[0]); - height_border->set_endedgesize(onnx_pads[2]); - auto* width_border = padding_type->mutable_paddingamounts()->add_borderamounts(); - width_border->set_startedgesize(onnx_pads[1]); - width_border->set_endedgesize(onnx_pads[3]); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "PoolOpBuilder, unexpected op: ", op_type); + } + + if (is_global_pooling) { + coreml_pool->set_globalpooling(true); + coreml_pool->mutable_valid(); + } else { // AveragePool or MaxPool + NodeAttrHelper helper(node); + const auto kernel_shape = helper.Get("kernel_shape", std::vector{0, 0}); + const auto strides = helper.Get("strides", std::vector{1, 1}); + const auto onnx_pads = helper.Get("pads", std::vector{0, 0, 0, 0}); + + coreml_pool->add_kernelsize(kernel_shape[0]); + coreml_pool->add_kernelsize(kernel_shape[1]); + coreml_pool->add_stride(strides[0]); + coreml_pool->add_stride(strides[1]); + coreml_pool->set_avgpoolexcludepadding(helper.Get("count_include_pad", 0) == 0); + coreml_pool->set_globalpooling(false); + + // Add Padding + // Usually using autopadding is more efficient than using explicit padding + // Try to see if we can map explicit padding to auto padding + std::vector input_shape; + ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Cannot get shape"); + AutoPadType auto_pad_type; + ORT_RETURN_IF_ERROR(HandleAutoPad(input_shape, kernel_shape[0], kernel_shape[1], + onnx_pads, strides, {1, 1} /* dilations */, + StringToAutoPadType(helper.Get("auto_pad", "NOTSET")), + auto_pad_type)); + + if (AutoPadType::SAME_UPPER == auto_pad_type || AutoPadType::SAME_LOWER == auto_pad_type) { + auto* padding_type = coreml_pool->mutable_same(); + if (AutoPadType::SAME_LOWER == auto_pad_type) { // default is 
SAME_UPPER + padding_type->set_asymmetrymode(COREML_SPEC::SamePadding_SamePaddingMode_TOP_LEFT_HEAVY); + } + } else { + auto* padding_type = coreml_pool->mutable_valid(); + if (AutoPadType::NOTSET == auto_pad_type && onnx_pads != std::vector{0, 0, 0, 0}) { + // NOTSET is adding the explicit padding to the ValidPadding.paddingAmounts + auto* height_border = padding_type->mutable_paddingamounts()->add_borderamounts(); + height_border->set_startedgesize(onnx_pads[0]); + height_border->set_endedgesize(onnx_pads[2]); + auto* width_border = padding_type->mutable_paddingamounts()->add_borderamounts(); + width_border->set_startedgesize(onnx_pads[1]); + width_border->set_endedgesize(onnx_pads[3]); + } } } - } - *layer->mutable_input()->Add() = input_defs[0]->Name(); - *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + *layer->mutable_input()->Add() = input_defs[0]->Name(); + *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + + model_builder.AddLayer(std::move(layer)); + } - model_builder.AddLayer(std::move(layer)); return Status::OK(); } -bool PoolOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& /* input_params */, +bool PoolOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const { const auto& op_type = node.OpType(); const auto& input_defs = node.InputDefs(); std::vector input_shape; - if (!GetShape(*input_defs[0], input_shape, logger)) + if (!GetShape(*input_defs[0], input_shape, logger)) { return false; + } + // TODO: ML Program supports 3D and 5D. Add if we have a use case for that. const auto input_size = input_shape.size(); if (input_size != 4) { - LOGS(logger, VERBOSE) - << op_type << " only supports rank-4 tensor, input [" - << input_defs[0]->Name() << "] has actual dim count " << input_size; + LOGS(logger, VERBOSE) << op_type << " only supports rank-4 tensor, input [" + << input_defs[0]->Name() << "] has actual dim count " << input_size; return false; } if (op_type == "AveragePool" || op_type == "MaxPool") { NodeAttrHelper helper(node); + const auto storage_order = helper.Get("storage_order", 0); if (storage_order == 1) { LOGS(logger, VERBOSE) << "storage_order == 1 is not supported"; @@ -128,12 +200,14 @@ bool PoolOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara return false; } - // TODO, add support of the ceil_mode by adjusting the padding - // See https://stackoverflow.com/questions/59906456/in-pytorchs-maxpool2d-is-padding-added-depending-on-ceil-mode - // and https://github.com/apple/coremltools/blob/1931758aae383c83daddfc56f11a24a9d2bf4b87/coremltools/converters/mil/frontend/torch/ops.py#L621-L644 - if (helper.Get("ceil_mode", 0) == 1) { - LOGS(logger, VERBOSE) << "ceil_mode == 1 is not supported for pooling"; - return false; + if (!input_params.create_mlprogram) { + // TODO, add support of the ceil_mode by adjusting the padding + // See https://stackoverflow.com/questions/59906456/in-pytorchs-maxpool2d-is-padding-added-depending-on-ceil-mode + // and https://github.com/apple/coremltools/blob/1931758aae383c83daddfc56f11a24a9d2bf4b87/coremltools/converters/mil/frontend/torch/ops.py#L621-L644 + if (helper.Get("ceil_mode", 0) == 1) { + LOGS(logger, VERBOSE) << "ceil_mode == 1 is not supported for pooling"; + return false; + } } if (helper.Get("dilations", std::vector{1, 1}) != diff --git a/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc index 
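Note: the GlobalAveragePool/GlobalMaxPool mapping above rests on the equivalence between a global pool and a spatial reduction that keeps the reduced dims ({N, C, H, W} -> {N, C, 1, 1}). A hedged standalone sketch of that equivalence on a plain NCHW buffer (purely illustrative, not EP code):

    #include <cstddef>
    #include <vector>

    // GlobalAveragePool over NCHW == reduce_mean over axes {2, 3} with keep_dims=true.
    std::vector<float> GlobalAveragePoolNCHW(const std::vector<float>& x,
                                             size_t N, size_t C, size_t H, size_t W) {
      std::vector<float> y(N * C);  // logically {N, C, 1, 1}
      for (size_t n = 0; n < N; ++n) {
        for (size_t c = 0; c < C; ++c) {
          double sum = 0.0;
          for (size_t i = 0; i < H * W; ++i) {
            sum += x[(n * C + c) * H * W + i];
          }
          y[n * C + c] = static_cast<float>(sum / static_cast<double>(H * W));
        }
      }
      return y;
    }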
7ae1746be3122..27d24d9c21893 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc @@ -1,11 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/framework/tensorprotoutils.h" #include "core/optimizer/initializer.h" -#include "core/providers/common.h" #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" #include "core/providers/coreml/builders/model_builder.h" #include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/coreml/shape_utils.h" @@ -26,34 +25,56 @@ class ReshapeOpBuilder : public BaseOpBuilder { // Reshape opset 4- uses attributes for new shape which we do not support for now int GetMinSupportedOpSet(const Node& /* node */) const override { return 5; } + + bool SupportsMLProgram() const override { return true; } }; void ReshapeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const { + // Skip the second input which is the new shape as we always have to create a new version as the CoreML rules + // are different from ONNX. model_builder.AddInitializerToSkip(node.InputDefs()[1]->Name()); } Status ReshapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { - std::unique_ptr layer = model_builder.CreateNNLayer(node); - const auto& input_defs = node.InputDefs(); - const auto& initializers(model_builder.GetInitializerTensors()); - const auto& target_shape_tensor = *initializers.at(input_defs[1]->Name()); - const int64_t* raw_target_shape = target_shape_tensor.int64_data().empty() - ? 
reinterpret_cast(target_shape_tensor.raw_data().data()) - : target_shape_tensor.int64_data().data(); - - const auto size = target_shape_tensor.dims()[0]; - TensorShapeVector target_shape{raw_target_shape, raw_target_shape + size}; std::vector input_shape; - ORT_RETURN_IF_NOT(GetStaticShape(*input_defs[0], input_shape, logger), "Cannot get shape"); - ReshapeHelper helper(TensorShape(input_shape), target_shape); - *layer->mutable_reshapestatic()->mutable_targetshape() = {target_shape.cbegin(), target_shape.cend()}; - *layer->mutable_input()->Add() = input_defs[0]->Name(); - *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + ORT_RETURN_IF_NOT(GetStaticShape(*input_defs[0], input_shape, logger), "Cannot get shape of data"); + + const auto& data_name = input_defs[0]->Name(); + const auto& new_shape_name = input_defs[1]->Name(); + Initializer unpacked_tensor(*model_builder.GetConstantInitializer(new_shape_name)); + TensorShapeVector new_shape = ToShapeVector(unpacked_tensor.DataAsSpan()); + + // ReshapeHelper applies the ONNX rules to create the concrete output shape + ReshapeHelper helper(TensorShape(input_shape), new_shape); + +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; - model_builder.AddLayer(std::move(layer)); + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation.reshape + std::unique_ptr reshape_op = model_builder.CreateOperation(node, "reshape"); + + AddOperationInput(*reshape_op, "x", data_name); + AddOperationInput(*reshape_op, "shape", + model_builder.AddConstant(reshape_op->type(), "shape", ToConstSpan(new_shape))); + + AddOperationOutput(*reshape_op, *node.OutputDefs()[0]); + + model_builder.AddOperation(std::move(reshape_op)); + } else +#endif // defined(COREML_ENABLE_MLPROGRAM) + { + std::unique_ptr layer = model_builder.CreateNNLayer(node); + + *layer->mutable_reshapestatic()->mutable_targetshape() = {new_shape.cbegin(), new_shape.cend()}; + *layer->mutable_input()->Add() = data_name; + *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + + model_builder.AddLayer(std::move(layer)); + } return Status::OK(); } @@ -61,14 +82,15 @@ bool ReshapeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputP const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const auto& new_shape_name = input_defs[1]->Name(); - const auto& initializers = input_params.graph_viewer.GetAllInitializedTensors(); - if (!Contains(initializers, new_shape_name)) { + const auto* new_shape_tensor = input_params.graph_viewer.GetConstantInitializer(new_shape_name); + if (!new_shape_tensor) { + // ONNX has different rules around how -1 and 0 values are used/combined, and + // we can't check if those can be translated to CoreML if the shape is unknown. 
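Aside: the ONNX rules referenced in the comment above are what ReshapeHelper resolves before the shape is handed to CoreML. A minimal sketch of those rules, assuming allowzero=0 semantics (illustrative only; the real logic lives in core/providers/cpu/tensor/reshape_helper.h, and the helper name here is hypothetical):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Resolve ONNX Reshape's special values: 0 means "copy the corresponding
// input dimension", -1 means "infer this dimension from the element count".
std::vector<int64_t> ResolveReshape(const std::vector<int64_t>& input,
                                    std::vector<int64_t> requested) {
  int64_t input_size = 1;
  for (int64_t d : input) input_size *= d;

  int64_t known = 1;
  int unknown_dim = -1;
  for (size_t i = 0; i < requested.size(); ++i) {
    if (requested[i] == 0) requested[i] = input[i];  // 0 => keep input dim i
    if (requested[i] == -1) {
      unknown_dim = static_cast<int>(i);  // ONNX allows at most one -1
    } else {
      known *= requested[i];
    }
  }

  if (unknown_dim >= 0) {
    assert(known != 0 && input_size % known == 0);
    requested[unknown_dim] = input_size / known;  // infer the remaining dimension
  }
  return requested;
}

// e.g. ResolveReshape({2, 3, 4}, {0, -1}) yields {2, 12}
```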
LOGS(logger, VERBOSE) << "New shape of reshape must be a constant initializer"; return false; } - const auto& new_shape_tensor = *initializers.at(new_shape_name); - Initializer unpacked_tensor(new_shape_tensor); + Initializer unpacked_tensor(*new_shape_tensor); auto new_shape = unpacked_tensor.DataAsSpan(); if (new_shape.empty()) { LOGS(logger, VERBOSE) << "New shape of reshape cannot be empty"; @@ -84,7 +106,7 @@ bool ReshapeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputP return false; } - // CoreML reshape doesn't support new shape with more than 5 dimensions + // CoreML reshape doesn't support new shape with more than 5 dimensions. if (new_shape.size() > 5) { LOGS(logger, VERBOSE) << "Reshape does not support new shape with rank greater than 5. Input shape: " << Shape2String(input_shape) << ", new shape: " << Shape2String(new_shape); @@ -93,7 +115,7 @@ bool ReshapeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputP // CoreML reshape does not support 0 as dimension NodeAttrHelper helper(node); - const bool allow_zero = helper.Get("allowzero ", 0) == 1; + const bool allow_zero = helper.Get("allowzero", 0) == 1; if (allow_zero) { if (std::find(new_shape.begin(), new_shape.end(), int64_t{0}) != new_shape.end()) { LOGS(logger, VERBOSE) << "Reshape does not support new shape with 0 as dimension when allowzero is enabled. " diff --git a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc index 35dcde41a6bcf..6c2fcc2ace856 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc @@ -98,7 +98,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const auto& input_defs = node.InputDefs(); const auto& initializers(model_builder.GetInitializerTensors()); - if (input_defs.size() == 3) { // use scales + if (input_defs.size() >= 3 && input_defs[2]->Exists()) { // use scales std::vector scales; ORT_RETURN_IF_NOT(GetResizeScales(initializers, node, scales, logger), "Error getting resize scales"); coreml_upsample->add_scalingfactor(static_cast(scales[2])); @@ -182,20 +182,24 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPa return false; } + bool using_scales = input_defs.size() >= 3 && input_defs[2]->Exists(); // scales - if (input_defs.size() == 3 && !Contains(initializers, input_defs[2]->Name())) { - LOGS(logger, VERBOSE) << "Input scales of Resize must be known"; + if (using_scales && !input_params.graph_viewer.GetConstantInitializer(input_defs[2]->Name())) { + LOGS(logger, VERBOSE) << "scales input of Resize must be a constant initializer"; return false; } // sizes - if (input_defs.size() > 3 && !Contains(initializers, input_defs[3]->Name())) { - LOGS(logger, VERBOSE) << "Input sizes of Resize must be known"; + if (!using_scales && + (input_defs.size() < 4 || + !input_defs[3]->Exists() || + !input_params.graph_viewer.GetConstantInitializer(input_defs[3]->Name()))) { + LOGS(logger, VERBOSE) << "sizes input of Resize must be a constant initializer"; return false; } // We want to check if the scales or sizes are not trying to resize on N/C channels here - if (input_defs.size() == 3) { // we are using scales + if (using_scales) { std::vector scales; if (!GetResizeScales(initializers, node, scales, logger)) return false; diff --git a/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc 
b/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc index b716af738e1b1..39bfbfe5bba1f 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc @@ -54,7 +54,7 @@ Status PrepareSliceComputeMetadataFromConstantInitializers(const Node& slice_nod return Status::OK(); } - const auto* tensor_proto = graph_viewer.GetConstantInitializer(input_defs[input_idx]->Name(), true); + const auto* tensor_proto = graph_viewer.GetConstantInitializer(input_defs[input_idx]->Name()); ORT_RETURN_IF_NOT(tensor_proto, "Failed to get constant initializer."); Initializer unpacked_tensor(*tensor_proto, graph_viewer.ModelPath()); const auto data_type = unpacked_tensor.data_type(); diff --git a/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc index 266396a0fe90e..d6584124c6aba 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc @@ -52,7 +52,7 @@ Status SoftmaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, target_shape.push_back(size_to_dimension); target_shape.push_back(size_from_dimension); - const auto reshape1_output_name = model_builder.GetUniqueName(MakeString(node.Name(), "reshape1_output")); + const auto reshape1_output_name = model_builder.GetUniqueName(node, "reshape1_output"); { // Add reshape layer auto reshape_layer = model_builder.CreateNNLayer(node, "_Softmax_reshape1"); *reshape_layer->mutable_reshapestatic()->mutable_targetshape() = {target_shape.cbegin(), target_shape.cend()}; @@ -60,7 +60,7 @@ Status SoftmaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, *reshape_layer->mutable_output()->Add() = reshape1_output_name; model_builder.AddLayer(std::move(reshape_layer)); } - const auto softmax_output_name = model_builder.GetUniqueName(MakeString(node.Name(), "softmax_output")); + const auto softmax_output_name = model_builder.GetUniqueName(node, "softmax_output"); { auto* coreml_softmaxnd = layer->mutable_softmaxnd(); coreml_softmaxnd->set_axis(-1); diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.cc b/onnxruntime/core/providers/coreml/builders/model_builder.cc index daab36f7b933d..eb4723a3b9746 100644 --- a/onnxruntime/core/providers/coreml/builders/model_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/model_builder.cc @@ -144,14 +144,18 @@ void CopyOnnxTensorToCoreMLTensor(const ONNX_NAMESPACE::TensorProto& tensor_prot break; } case ONNX_NAMESPACE::TensorProto_DataType_INT64: { - // from: int64_data/raw, to: longints - if (has_raw_data) { - CopyRawDataToRepeatedField(tensor_proto, *tensor_value.mutable_longints()->mutable_values()); - - } else { - tensor_value.mutable_longints()->mutable_values()->CopyFrom(tensor_proto.int64_data()); - } - break; + // enable when this is proven to not be the case + ORT_THROW( + "INT64 is unexpected as CoreML uses 32-bit int for indices. 
" + "Most likely an initializer that should have been skipped was not."); + //// from: int64_data/raw, to: longints + // if (has_raw_data) { + // CopyRawDataToRepeatedField(tensor_proto, *tensor_value.mutable_longints()->mutable_values()); + + //} else { + // tensor_value.mutable_longints()->mutable_values()->CopyFrom(tensor_proto.int64_data()); + //} + // break; } case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: { // from: int32_data/raw, to: bytes @@ -186,18 +190,22 @@ void CopyOnnxTensorToCoreMLTensor(const ONNX_NAMESPACE::TensorProto& tensor_prot break; } case ONNX_NAMESPACE::TensorProto_DataType_UINT64: { - // from: uint64_data/raw, to: longints - if (has_raw_data) { - CopyRawDataToRepeatedField(tensor_proto, *tensor_value.mutable_longints()->mutable_values()); - } else { - // TODO: Is this safe? Need to check the CopyFrom implementation. As it's a straight copy of bytes this - // hopefully can do it as one block instead of iterating and potentially doing a static_cast of each - // individual value. - tensor_value.mutable_longints()->mutable_values()->CopyFrom( - reinterpret_cast&>(tensor_proto.uint64_data())); - } - - break; + // enable when this is proven to not be the case + ORT_THROW( + "UINT64 is unexpected as CoreML uses 32-bit int for indices. " + "Most likely an initializer that should have been skipped was not."); + //// from: uint64_data/raw, to: longints + // if (has_raw_data) { + // CopyRawDataToRepeatedField(tensor_proto, *tensor_value.mutable_longints()->mutable_values()); + // } else { + // // TODO: Is this safe? Need to check the CopyFrom implementation. As it's a straight copy of bytes this + // // hopefully can do it as one block instead of iterating and potentially doing a static_cast of each + // // individual value. + // tensor_value.mutable_longints()->mutable_values()->CopyFrom( + // reinterpret_cast&>(tensor_proto.uint64_data())); + // } + + // break; } case ONNX_NAMESPACE::TensorProto_DataType_BOOL: { // from: int32_data/raw, to: bools @@ -392,23 +400,28 @@ std::string GetModelOutputPath(bool create_ml_program) { } // namespace ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logger& logger, - int32_t coreml_version, uint32_t coreml_flags) + int32_t coreml_version, uint32_t coreml_flags, + std::vector&& onnx_input_names, + std::vector&& onnx_output_names) : graph_viewer_(graph_viewer), logger_(logger), coreml_version_(coreml_version), coreml_flags_(coreml_flags), create_ml_program_((coreml_flags_ & COREML_FLAG_CREATE_MLPROGRAM) != 0), model_output_path_(GetModelOutputPath(create_ml_program_)), + onnx_input_names_(std::move(onnx_input_names)), + onnx_output_names_(std::move(onnx_output_names)), coreml_model_(std::make_unique()) { if (create_ml_program_) { #if defined(COREML_ENABLE_MLPROGRAM) coreml_model_->set_specificationversion(CoreMLSpecVersion()); MILSpec::Program& mlprogram = *coreml_model_->mutable_mlprogram(); - MILSpec::Function& main = (*mlprogram.mutable_functions())["main"]; + mlprogram.set_version(1); + mlprogram_main_fn_ = &(*mlprogram.mutable_functions())["main"]; const std::string coreml_opset = "CoreML" + std::to_string(CoreMLVersion()); - *main.mutable_opset() = coreml_opset; - mlprogram_main_ = &(*main.mutable_block_specializations())[coreml_opset]; + *mlprogram_main_fn_->mutable_opset() = coreml_opset; + mlprogram_main_block_ = &(*mlprogram_main_fn_->mutable_block_specializations())[coreml_opset]; // create the ModelPackage. this creates the output directory. 
mlpackage_ = std::make_unique(model_output_path_, /* create */ true); @@ -426,6 +439,8 @@ ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logge weights_file_writer_ = std::make_unique(weights_info->path() + "/weight.bin"); #else // should never happen due to handling in coreml_execution_provider.cc + // throw here so all other code in this class can assume create_ml_program_ is only ever true in a build + // where ML Program support is enabled. ORT_THROW("ML Program is not enabled in this build"); #endif } else { @@ -435,6 +450,28 @@ ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logge neural_network->set_arrayinputshapemapping( CoreML::Specification::NeuralNetworkMultiArrayShapeMapping::EXACT_ARRAY_MAPPING); } + + // populate names. + const auto& initializers = graph_viewer_.GetAllInitializedTensors(); + const auto& inputs = graph_viewer_.GetInputs(); + // rough guess to try and avoid reallocs. most nodes produce one output but some have more so allow for that. + // also need to convert attributes to constants so allow for that + unique_names_.reserve(initializers.size() + inputs.size() + size_t(graph_viewer_.NumberOfNodes() * 1.5)); + for (const auto& pair : initializers) { + unique_names_.insert(pair.first); + } + + for (const auto* input : inputs) { + unique_names_.insert(input->Name()); + } + + for (const auto& node : graph_viewer_.Nodes()) { + for (const auto& def : node.OutputDefs()) { + if (def->Exists()) { + unique_names_.insert(def->Name()); + } + } + } } ModelBuilder::~ModelBuilder() = default; @@ -455,11 +492,94 @@ void ModelBuilder::AddLayer(std::unique_ptr layer) { neural_network->mutable_layers()->AddAllocated(layer.release()); } -#if defined(COREML_ENABLE_MLPROGRAM) - /* * ML Program related helpers */ +#if defined(COREML_ENABLE_MLPROGRAM) +const std::string& ModelBuilder::GetSafeName(const std::string& name) { + // Check the name is valid according to the MILSpec rules + // `Identifiers, generally used for names and keys, must match the regular expression [A-Za-z\_][A-Za-z0-9\_@]*.` + // + // There is a secondary list of reserved words that the coremltools python uses, but it's not clear if those are + // required here, or if we will ever hit a model that uses one of them. Due to that, skip checking them for now as + // it adds cost and code complexity + // https://github.com/apple/coremltools/blob/8b37641f243b1a3e81452feea311c6e30dcc9287/coremltools/converters/mil/mil/passes/defs/preprocess.py#L151C1-L175C10 + // static InlinedHashSet reserved_names = + // {"any", "bool", "program", "func", "tensor", "list", "dict", "tuple", "true", "false", + // "string", "bf16", "fp16", "fp32", "fp64", "int8", "int16", "int32", "int64", + // "uint8", "uint16", "uint32", "uint64"}; + + // handle empty name. shouldn't happen but code below assumes name is not empty + if (name.empty()) { + return name; + } + + // We don't need '@' or '\' even though they're allowed. Optimize for a good name that does not need to be changed. + + // has been sanitized and changed already + const auto entry = values_to_rename_.find(name); + if (entry != values_to_rename_.end()) { + return entry->second; + } + + // Replace anything but a good char with '_'. 
If first char is 0-9 we prefix with '_'; + bool changed = false; + std::string result = name; + + if (std::isdigit(result[0])) { + changed = true; + result = '_' + name; + } + + for (char& c : result) { + if (!std::isalnum(c) && c != '_') { + changed = true; + c = '_'; + } + } + + if (!changed) { + return name; // return original as the return value is a reference that must remain valid + } + + return (values_to_rename_[name] = GetUniqueName(result)); +} + +void ModelBuilder::SanitizeNames() { + // ML Model level inputs/outputs + auto* desc = coreml_model_->mutable_description(); + for (auto& input : *desc->mutable_input()) { + input.set_name(GetSafeName(input.name())); + } + + for (auto& output : *desc->mutable_output()) { + output.set_name(GetSafeName(output.name())); + } + + // main function inputs/outputs. + for (auto& input : *mlprogram_main_fn_->mutable_inputs()) { + input.set_name(GetSafeName(input.name())); + } + + // outputs from block with operations for current coreml version + for (auto& output : *mlprogram_main_block_->mutable_outputs()) { + output = GetSafeName(output); + } + + // iterate operations changing input/output/node names + for (auto& op : *mlprogram_main_block_->mutable_operations()) { + for (auto& input : *op.mutable_inputs()) { + for (auto& arg : *input.second.mutable_arguments()) { + arg.set_name(GetSafeName(arg.name())); + } + } + + for (auto& output : *op.mutable_outputs()) { + output.set_name(GetSafeName(output.name())); + } + } +} + std::unique_ptr ModelBuilder::CreateOperation(const Node& node, std::string_view op_type, std::string_view suffix) { @@ -472,14 +592,9 @@ std::unique_ptr ModelBuilder::CreateOperation(c return op; } -void ModelBuilder::AddConstant(std::string_view name, const ONNX_NAMESPACE::TensorProto& initializer) { - MILSpec::Value coreml_tensor = OnnxTensorToCoreMLTensor(initializer, *weights_file_writer_); - AddConstantOperation(name, std::move(coreml_tensor)); -} - -void ModelBuilder::AddConstantOperation(std::string_view name, MILSpec::Value&& coreml_tensor) { +const std::string& ModelBuilder::AddConstantOperation(std::string_view name, MILSpec::Value&& coreml_tensor) { // Replicates coremltools/converters/mil/backend/mil/load.py translate_const logic - MILSpec::Operation& const_op = *mlprogram_main_->mutable_operations()->Add(); + MILSpec::Operation& const_op = *mlprogram_main_block_->mutable_operations()->Add(); const_op.set_type("const"); MILSpec::NamedValueType& output = *const_op.mutable_outputs()->Add(); @@ -487,58 +602,63 @@ void ModelBuilder::AddConstantOperation(std::string_view name, MILSpec::Value&& *output.mutable_type() = coreml_tensor.type(); auto& attr_map = *const_op.mutable_attributes(); - attr_map["name"] = CreateScalarTensorValue(std::string(name)); + // the operation name doesn't really matter as it isn't used elsewhere, so sanitize name now + attr_map["name"] = CreateScalarTensorValue(GetSafeName(output.name())); attr_map["val"] = std::move(coreml_tensor); + + return output.name(); } // Add operation to the Block for the main function in the ML Program void ModelBuilder::AddOperation(std::unique_ptr operation) { - mlprogram_main_->mutable_operations()->AddAllocated(operation.release()); + mlprogram_main_block_->mutable_operations()->AddAllocated(operation.release()); } -std::string ModelBuilder::AddTensorValueAsConstantOperation(std::string_view op_type, std::string_view value_type, - MILSpec::Value&& input_value) { +const std::string& ModelBuilder::AddTensorValueAsConstantOperation(std::string_view op_type, + 
std::string_view value_type, + MILSpec::Value&& input_value) { auto unique_value_name = GetUniqueName(MakeString(op_type, "_", value_type)); - AddConstantOperation(unique_value_name, std::move(input_value)); - return unique_value_name; + return AddConstantOperation(unique_value_name, std::move(input_value)); } template -std::string ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, gsl::span value, - std::optional> shape) { +std::string_view ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, + gsl::span value, + std::optional> shape) { // add specialization below static_assert(false_for_T, "Missing specialization for value type"); - return ""; // unreachable + + return "ModelBuilder::AddConstant error"; // unreachable } template <> -std::string ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, - gsl::span value, - std::optional> shape) { +std::string_view ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, + gsl::span value, + std::optional> shape) { auto input_value = CreateTensorValue(value, shape); return AddTensorValueAsConstantOperation(op_type, value_type, std::move(input_value)); } template <> -std::string ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, - gsl::span value, - std::optional> shape) { +std::string_view ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, + gsl::span value, + std::optional> shape) { auto input_value = CreateTensorValue(value, shape); // CoreML uses int32 return AddTensorValueAsConstantOperation(op_type, value_type, std::move(input_value)); } template <> -std::string ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, - gsl::span value, - std::optional> shape) { +std::string_view ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, + gsl::span value, + std::optional> shape) { auto input_value = CreateTensorValue(value, shape); return AddTensorValueAsConstantOperation(op_type, value_type, std::move(input_value)); } template <> -std::string ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, - gsl::span value, - std::optional> shape) { +std::string_view ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type, + gsl::span value, + std::optional> shape) { auto input_value = CreateTensorValue(value, shape); return AddTensorValueAsConstantOperation(op_type, value_type, std::move(input_value)); } @@ -581,11 +701,13 @@ Status ModelBuilder::RegisterInitializers() { continue; } - if (create_ml_program_) { #if defined(COREML_ENABLE_MLPROGRAM) - AddConstant(name, tensor); + if (create_ml_program_) { + MILSpec::Value coreml_tensor = OnnxTensorToCoreMLTensor(tensor, *weights_file_writer_); + ORT_IGNORE_RETURN_VALUE(AddConstantOperation(name, std::move(coreml_tensor))); + } else #endif - } else { + { std::unique_ptr layer = std::make_unique(); layer->set_name(GetUniqueName("initializer_" + name)); @@ -616,32 +738,33 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i if (is_input) { // input should not be an initializer - if (Contains(GetInitializerTensors(), name)) + if (Contains(GetInitializerTensors(), name)) { return Status::OK(); + } // This input will not be used - if (Contains(skipped_inputs_, name)) + if (Contains(skipped_inputs_, name)) { return Status::OK(); + } } auto* model_description = 
coreml_model_->mutable_description(); - auto& input_output = is_input - ? *model_description->mutable_input()->Add() - : *model_description->mutable_output()->Add(); + auto& input_output = is_input ? *model_description->mutable_input()->Add() + : *model_description->mutable_output()->Add(); input_output.set_name(name); + auto* multi_array = input_output.mutable_type()->mutable_multiarraytype(); std::vector shape; - ORT_RETURN_IF_NOT(GetShape(node_arg, shape, logger_), - "Unable to get shape for ", input_output_type, ": ", name); + ORT_RETURN_IF_NOT(GetShape(node_arg, shape, logger_), "Unable to get shape for ", input_output_type, ": ", name); if (shape.empty()) { - // If we have an empty shape, this is a scalar input, - // Since all the input output of CoreML EP is MultiArray, we will make the scalar input output as a {1} MultiArray + // If we have an empty shape, this is a scalar + // Since all the input/output of CoreML EP is MultiArray, we will make the scalar input/output a {1} MultiArray shape.push_back(1); - // we need to change the shapes of these scalar outputs back to {} when CoreML EP returns these values to ORT + // we need to change the shapes of scalar outputs back to {} when CoreML EP returns values to ORT if (!is_input) { AddScalarOutput(name); } @@ -713,13 +836,20 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i #if defined(COREML_ENABLE_MLPROGRAM) if (create_ml_program_) { - MILSpec::Function& main = (*coreml_model_->mutable_mlprogram()->mutable_functions())["main"]; if (is_input) { - // the model inputs need to be wired up as args to the 'main' function - main.mutable_inputs()->Add(CreateNamedTensorValueType(node_arg)); + // the model inputs need to be wired up as args to the 'main' function. + auto tensor_value_type = CreateNamedTensorValueType(node_arg); + tensor_value_type.set_name(name); + if (node_arg.Shape()->dim_size() == 0) { + // update shape from {} to {1} (same change we made at the model input level above). + tensor_value_type.mutable_type()->mutable_tensortype()->set_rank(1); + tensor_value_type.mutable_type()->mutable_tensortype()->add_dimensions()->mutable_constant()->set_size(1); + } + + mlprogram_main_fn_->mutable_inputs()->Add(std::move(tensor_value_type)); } else { // the model outputs need to be set as outputs of the Block for the 'main' function - *mlprogram_main_->mutable_outputs()->Add() = node_arg.Name(); + *mlprogram_main_block_->mutable_outputs()->Add() = name; } } #endif // defined(COREML_ENABLE_MLPROGRAM) @@ -744,7 +874,7 @@ Status ModelBuilder::ProcessNodes() { // This shouldn't happen as this is called from CoreMLExecutionProvider::Compile and should only be processing // nodes that we said were supported and were returned from CoreMLExecutionProvider::GetCapability. return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Node [", node.Name(), "], type [", node.OpType(), "] is not supported"); + "Node [", node.Name(), "], type [", node.OpType(), "] was not able to be processed"); } } @@ -767,6 +897,12 @@ Status ModelBuilder::CreateModel() { ORT_RETURN_IF_ERROR(ProcessNodes()); ORT_RETURN_IF_ERROR(RegisterModelOutputs()); +#if defined(COREML_ENABLE_MLPROGRAM) + if (create_ml_program_) { + SanitizeNames(); + } +#endif + return Status::OK(); } @@ -795,7 +931,7 @@ Status ModelBuilder::SaveModel() { #if defined(COREML_ENABLE_MLPROGRAM) // need to delete the ModelPackage instance for it to write out the manifest. clear out the other ML Program // related types as well. 
- mlprogram_main_ = nullptr; + mlprogram_main_block_ = nullptr; mlpackage_.reset(); weights_file_writer_.reset(); #endif @@ -804,11 +940,51 @@ Status ModelBuilder::SaveModel() { } Status ModelBuilder::LoadModel(std::unique_ptr& model) { - model = std::make_unique(model_output_path_, - std::move(input_output_info_), - std::move(scalar_outputs_), - std::move(int64_outputs_), - logger_, coreml_flags_); +#if defined(COREML_ENABLE_MLPROGRAM) + if (create_ml_program_) { + // we need to provide the sanitized names for model inputs/outputs so that info is captured. + // the input/output matching when we execute the model from the CoreML EP is based on order, so the change + // to the names doesn't matter for that. + auto get_sanitized_names = [this](std::vector&& names) -> std::vector { + std::vector output(std::move(names)); + + for (std::string& name : output) { + name = GetSafeName(name); + } + + return output; + }; + + // also need to update the keys in input_output_info_ + auto get_sanitized_io_info = [this](std::unordered_map&& info) { + std::unordered_map output; + output.reserve(info.size()); + + for (auto entry = info.begin(), end = info.end(); entry != end; ++entry) { + output.emplace(GetSafeName(entry->first), std::move(entry->second)); + } + + return output; + }; + + model = std::make_unique(model_output_path_, + get_sanitized_names(std::move(onnx_input_names_)), + get_sanitized_names(std::move(onnx_output_names_)), + get_sanitized_io_info(std::move(input_output_info_)), + std::move(scalar_outputs_), + std::move(int64_outputs_), + logger_, coreml_flags_); + } else +#endif + { + model = std::make_unique(model_output_path_, + std::move(onnx_input_names_), + std::move(onnx_output_names_), + std::move(input_output_info_), + std::move(scalar_outputs_), + std::move(int64_outputs_), + logger_, coreml_flags_); + } return model->LoadModel(); // load using CoreML API, including compilation } @@ -816,8 +992,11 @@ Status ModelBuilder::LoadModel(std::unique_ptr& model) { // static Status ModelBuilder::Build(const GraphViewer& graph_viewer, const logging::Logger& logger, int32_t coreml_version, uint32_t coreml_flags, + std::vector&& onnx_input_names, + std::vector&& onnx_output_names, std::unique_ptr& model) { - ModelBuilder builder(graph_viewer, logger, coreml_version, coreml_flags); + ModelBuilder builder(graph_viewer, logger, coreml_version, coreml_flags, + std::move(onnx_input_names), std::move(onnx_output_names)); ORT_RETURN_IF_ERROR(builder.CreateModel()); ORT_RETURN_IF_ERROR(builder.SaveModel()); @@ -847,20 +1026,31 @@ void ModelBuilder::AddInputToSkip(const std::string& input_name) { skipped_inputs_.insert(input_name); } -std::string ModelBuilder::GetUniqueName(std::string_view base_name) { +const std::string& ModelBuilder::GetUniqueName(const std::string& base_name) { + if (unique_names_.find(base_name) == unique_names_.end()) { + return *unique_names_.insert(base_name).first; + } + std::string unique_name; - do { - std::ostringstream os; - os << base_name << "_token_" << name_token_++; - unique_name = os.str(); - } while (Contains(unique_names_, unique_name)); + std::string suffix; + + // supports up to 1000 unique names without having to grow in the loop + unique_name.reserve(base_name.size() + 5); + unique_name = base_name; + + while (Contains(unique_names_, unique_name)) { + // assign followed by += to avoid creating temporary strings. 
+ unique_name = base_name; + unique_name += "__"; + unique_name += std::to_string(name_token_++); + } - return unique_name; + return *unique_names_.insert(unique_name).first; } -std::string ModelBuilder::GetUniqueName(const Node& node, std::string_view suffix) { +const std::string& ModelBuilder::GetUniqueName(const Node& node, std::string_view suffix) { if (node.Name().empty()) { - return GetUniqueName(MakeString("Node_", node.Index(), "_", node.OpType(), suffix)); + return GetUniqueName(MakeString(node.OpType(), "_", node.Index(), suffix)); } else { return GetUniqueName(node.Name() + std::string(suffix)); } diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.h b/onnxruntime/core/providers/coreml/builders/model_builder.h index 961ba647257b5..8f85ab2c09e7c 100644 --- a/onnxruntime/core/providers/coreml/builders/model_builder.h +++ b/onnxruntime/core/providers/coreml/builders/model_builder.h @@ -25,17 +25,20 @@ namespace onnxruntime { namespace coreml { class IOpBuilder; -class Model; class ModelBuilder { private: ModelBuilder(const GraphViewer& graph_viewer, const logging::Logger& logger, - int32_t coreml_version, uint32_t coreml_flags); + int32_t coreml_version, uint32_t coreml_flags, + std::vector&& onnx_input_names, + std::vector&& onnx_output_names); public: // Create the CoreML model, serialize to disk, load and compile using the CoreML API and return in `model` static Status Build(const GraphViewer& graph_viewer, const logging::Logger& logger, int32_t coreml_version, uint32_t coreml_flags, + std::vector&& onnx_input_names, + std::vector&& onnx_output_names, std::unique_ptr& model); ~ModelBuilder(); @@ -101,8 +104,8 @@ class ModelBuilder { /// /// Unique name generated for value. template - std::string AddConstant(std::string_view op_type, std::string_view value_type, gsl::span value, - std::optional> shape = std::nullopt) { + std::string_view AddConstant(std::string_view op_type, std::string_view value_type, gsl::span value, + std::optional> shape = std::nullopt) { static_assert(std::is_same_v || std::is_same_v || std::is_same_v || @@ -113,8 +116,8 @@ class ModelBuilder { } template - std::string AddConstant(std::string_view op_type, std::string_view value_type, const std::vector& value, - std::optional> shape = std::nullopt) { + std::string_view AddConstant(std::string_view op_type, std::string_view value_type, const std::vector& value, + std::optional> shape = std::nullopt) { return AddConstant(op_type, value_type, AsSpan(value), shape); } @@ -122,17 +125,10 @@ class ModelBuilder { /// Add a scalar value as a 'const' operation. See AddConstant for details. 
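Usage sketch for the AddConstant/AddScalarConstant helpers declared here, mirroring how the pooling builder earlier in this diff calls them (the wrapper function name is hypothetical; AddOperationInput comes from builder_utils.h):

```cpp
// Attribute values become named 'const' operations; the returned value names
// (possibly sanitized later) are wired up as inputs of the consuming operation.
void AddPoolParams(ModelBuilder& model_builder, COREML_SPEC::MILSpec::Operation& op,
                   const std::vector<int64_t>& strides, bool ceil_mode) {
  AddOperationInput(op, "strides",
                    model_builder.AddConstant(op.type(), "strides", strides));
  AddOperationInput(op, "ceil_mode",
                    model_builder.AddScalarConstant(op.type(), "ceil_mode", ceil_mode));
}
```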
/// + template - std::string AddScalarConstant(std::string_view op_type, std::string_view value_type, const T& value) { + std::string_view AddScalarConstant(std::string_view op_type, std::string_view value_type, const T& value) { return AddConstant(op_type, value_type, AsSpan({value}), AsSpan({})); } - /// - /// Add an existing a constant ONNX initializer to the ML Program as a 'const' operation - /// - /// Initializer name - /// Initializer data - void AddConstant(std::string_view name, const ONNX_NAMESPACE::TensorProto& initializer); - // add the operation to the main function void AddOperation(std::unique_ptr operation); #endif @@ -149,18 +145,26 @@ class ModelBuilder { // be added to CoreML model, since CoreML does not like input unused void AddInputToSkip(const std::string& input_name); - std::string GetUniqueName(std::string_view base_name); - std::string GetUniqueName(const Node& node, std::string_view suffix); + const std::string& GetUniqueName(const std::string& base_name); + const std::string& GetUniqueName(const Node& node, std::string_view suffix); + + const logging::Logger& Logger() const { return logger_; } private: #if defined(COREML_ENABLE_MLPROGRAM) template - std::string AddConstantImpl(std::string_view op_type, std::string_view value_type, gsl::span value, - std::optional> shape = std::nullopt); - - void AddConstantOperation(std::string_view name, COREML_SPEC::MILSpec::Value&& initializer); - std::string AddTensorValueAsConstantOperation(std::string_view op_type, std::string_view value_type, - COREML_SPEC::MILSpec::Value&& input_value); + std::string_view AddConstantImpl(std::string_view op_type, std::string_view value_type, gsl::span value, + std::optional> shape = std::nullopt); + + // apply the CoreML naming rules and fix any invalid names. + const std::string& GetSafeName(const std::string& name); + // sanitize all the names in the ML Model + void SanitizeNames(); + + // add Value as a const operation. return value name in case sanitization changed it + const std::string& AddConstantOperation(std::string_view name, COREML_SPEC::MILSpec::Value&& initializer); + const std::string& AddTensorValueAsConstantOperation(std::string_view op_type, std::string_view value_type, + COREML_SPEC::MILSpec::Value&& input_value); #endif // Convert the ONNX model in graph_viewer_ to a CoreML::Specification::Model and serialize to disk. @@ -193,6 +197,9 @@ class ModelBuilder { const bool create_ml_program_; // ML Program (CoreML5, iOS 15+, macOS 12+) or NeuralNetwork (old) const std::string model_output_path_; // create_ml_program_ ? dir for mlpackage : filename for mlmodel + std::vector onnx_input_names_; + std::vector onnx_output_names_; + std::unique_ptr coreml_model_; std::unordered_set scalar_outputs_; std::unordered_set int64_outputs_; @@ -208,9 +215,19 @@ class ModelBuilder { // mlprogram_main_ is the main block of the CoreML ML Program. // It is set in CreateModel to the CoreML Model.mlprogram.functions['main'].block_specializations['CoreML'] // entry we create. - COREML_SPEC::MILSpec::Block* mlprogram_main_{nullptr}; + COREML_SPEC::MILSpec::Function* mlprogram_main_fn_{nullptr}; // Function that contains a Block with the operations + COREML_SPEC::MILSpec::Block* mlprogram_main_block_{nullptr}; // Block that all the operations are added to std::unique_ptr mlpackage_; std::unique_ptr weights_file_writer_; + + // Values must start with [a-zA-Z_] + // Additionally they can't be in a list of reserved words.
+ // If we need to sanitize an initializer name we do so during PreprocessInitializers and apply the change during + // RegisterInitializers. + // We also check inputs in AddOperation and apply the change there. + // This means an op builder author doesn't need to be aware of the renaming. + // https://github.com/apple/coremltools/blob/8b37641f243b1a3e81452feea311c6e30dcc9287/coremltools/converters/mil/mil/passes/defs/preprocess.py#L146-L149 + std::unordered_map values_to_rename_; #endif }; diff --git a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc index 8e718da07703c..0ba715cc7c6d9 100644 --- a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc +++ b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc @@ -114,28 +114,27 @@ common::Status CoreMLExecutionProvider::Compile(const std::vector& node_compute_funcs) { for (const auto& fused_node_and_graph : fused_nodes_and_graphs) { Node& fused_node = fused_node_and_graph.fused_node; - const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph); std::unique_ptr coreml_model; - ORT_RETURN_IF_ERROR(coreml::ModelBuilder::Build(graph_viewer, *GetLogger(), coreml_version_, coreml_flags_, - coreml_model)); - { - const auto& input_defs = fused_node.InputDefs(); - std::vector onnx_input_names(input_defs.size()); - for (size_t i = 0, end = input_defs.size(); i < end; ++i) { - onnx_input_names[i] = input_defs[i]->Name(); - } - coreml_model->SetOnnxInputs(std::move(onnx_input_names)); - } + auto get_names = [](const ConstPointerContainer>& args) -> std::vector { + std::vector names; + names.reserve(args.size()); - { - const auto& output_defs = fused_node.OutputDefs(); - std::vector onnx_output_names(output_defs.size()); - for (size_t i = 0, end = output_defs.size(); i < end; ++i) { - onnx_output_names[i] = output_defs[i]->Name(); - } - coreml_model->SetOnnxOutputs(std::move(onnx_output_names)); + for (const NodeArg* def : args) { + names.push_back(def->Name()); + } + + return names; + }; + + std::vector onnx_input_names = get_names(fused_node.InputDefs()); + std::vector onnx_output_names = get_names(fused_node.OutputDefs()); + + const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph); + ORT_RETURN_IF_ERROR(coreml::ModelBuilder::Build(graph_viewer, *GetLogger(), coreml_version_, coreml_flags_, + std::move(onnx_input_names), std::move(onnx_output_names), + coreml_model)); } coreml_models_.emplace(fused_node.Name(), std::move(coreml_model)); @@ -153,13 +152,14 @@ common::Status CoreMLExecutionProvider::Compile(const std::vector(state); - const auto& model_inputs = model->GetOnnxInputs(); - const auto& model_outputs = model->GetOnnxOutputs(); + + // input/output names used by the CoreML model in the order that matches the fused_node InputDefs/OutputDefs + const auto& model_inputs = model->GetOrderedInputs(); + const auto& model_outputs = model->GetOrderedOutputs(); ORT_RETURN_IF_NOT(model_inputs.size() <= num_inputs, "Inconsistent input sizes"); ORT_RETURN_IF_NOT(model_outputs.size() == num_outputs, "Inconsistent output sizes"); @@ -182,28 +182,25 @@ common::Status CoreMLExecutionProvider::Compile(const std::vectorshape; - ORT_RETURN_IF(!coreml::IsStaticShape(inferred_shape) && coreml::DoesShapeSpecifyZeroElements(shape), - "Input (", input_name, ") has a dynamic shape (", coreml::Shape2String(inferred_shape), - ") but the runtime shape (", coreml::Shape2String(shape), - ") has zero elements. 
This is not supported by the CoreML EP."); - } + const auto& inferred_shape = input_info->shape; + ORT_RETURN_IF(!coreml::IsStaticShape(inferred_shape) && coreml::DoesShapeSpecifyZeroElements(shape), + "Input (", input_name, ") has a dynamic shape (", coreml::Shape2String(inferred_shape), + ") but the runtime shape (", coreml::Shape2String(shape), + ") has zero elements. This is not supported by the CoreML EP."); // If we have an empty shape, this is a scalar input, // Since all the input output of CoreML EP is MultiArray, we will make the scalar input as a {1} MultiArray - if (shape.empty()) + if (shape.empty()) { shape.push_back(1); + } // CoreML MLMultiArray API expect input to be non-const // https://developer.apple.com/documentation/coreml/mlmultiarray/2881219-initwithdatapointer?language=objc void* inputBuffer = const_cast(input_tensor.GetTensorRawData()); - inputs.emplace( - input_name, - coreml::OnnxTensorData{ - coreml::OnnxTensorInfo{tensor_info.GetElementType(), shape}, - inputBuffer, - }); + inputs.emplace(input_name, coreml::OnnxTensorData{ + coreml::OnnxTensorInfo{tensor_info.GetElementType(), shape}, + inputBuffer, + }); } // From this point we will need to take the exclusive lock on the model until the Predict is @@ -215,14 +212,13 @@ common::Status CoreMLExecutionProvider::Compile(const std::vector static_shape) -> void* { + [&ctx, &model_outputs](const std::string& name, + int32_t requested_onnx_tensor_element_type, + gsl::span static_shape) -> void* { const auto model_output_it = std::find(model_outputs.begin(), model_outputs.end(), name); ORT_ENFORCE(model_output_it != model_outputs.end(), "Failed to find CoreML model output name: ", name); - const auto output_idx = gsl::narrow_cast(std::distance(model_outputs.begin(), model_output_it)); + const auto output_idx = gsl::narrow_cast(std::distance(model_outputs.begin(), model_output_it)); auto output_tensor = ctx.GetOutput(output_idx, static_shape.data(), static_shape.size()); const auto type_and_shape_info = output_tensor.GetTensorTypeAndShapeInfo(); @@ -243,13 +239,15 @@ common::Status CoreMLExecutionProvider::Compile(const std::vectorIsScalarOutput(output_name)) + if (model->IsScalarOutput(output_name)) { output_shape.clear(); + } // Since CoreML EP only accepts int32 output type and onnx requires int64 output, // We are going to set the model output (from int32) ->int64 - if (model->IsInt64Output(output_name)) + if (model->IsInt64Output(output_name)) { output_type = ONNX_NAMESPACE::TensorProto_DataType_INT64; + } outputs.emplace(output_name, coreml::OnnxTensorInfo{output_type, output_shape}); } diff --git a/onnxruntime/core/providers/coreml/dump_mlprogram_model.py b/onnxruntime/core/providers/coreml/dump_mlprogram_model.py new file mode 100644 index 0000000000000..a3ceee70684dc --- /dev/null +++ b/onnxruntime/core/providers/coreml/dump_mlprogram_model.py @@ -0,0 +1,27 @@ +import sys + +import coremltools as ct + +if len(sys.argv) < 2: + print(f"Usage: {sys.argv[0]} ") + print("If generated by onnxruntime this will be /Data/com.microsoft.onnxruntime/model.mlmodel") + sys.exit(-1) + +model_path = sys.argv[1] +m = ct.models.MLModel(model_path) + +spec = m.get_spec() +print(spec) + +# Example code if you want to filter output or do more advanced things +# main = spec.mlProgram.functions["main"] +# block = main.block_specializations[main.opset] +# print(f"{len(block.operations)} operators") +# for op in block.operations: +# if op.type == 'const': +# if op.attributes["name"].immediateValue.tensor.strings.values[0] == 
"conv_0_pad_type_0": +# print(f"Conv pad_type={op.attributes['val'].immediateValue.tensor.strings.values}") +# +# if op.type == 'conv': +# #print(op) +# pass diff --git a/onnxruntime/core/providers/coreml/model/host_utils.h b/onnxruntime/core/providers/coreml/model/host_utils.h index 4f9a014c4d885..a9991ccb945ce 100644 --- a/onnxruntime/core/providers/coreml/model/host_utils.h +++ b/onnxruntime/core/providers/coreml/model/host_utils.h @@ -67,6 +67,12 @@ int CoreMLVersion(); // Get a temporary macOS/iOS temp file path std::string GetTemporaryFilePath(); +#if !defined(NDEBUG) && defined(__APPLE__) +// Override location the model is written to so that a) it's easily found and b) it is not automatically deleted +// when the EP exits. Use to debug the model that is generated. +// See onnxruntime/core/providers/coreml/dump_mlprogram_model.py for a script to dump the ML Program. +constexpr const char* kOverrideModelOutputDirectoryEnvVar = "ORT_COREML_EP_MODEL_DIR"; +#endif } // namespace util } // namespace coreml } // namespace onnxruntime diff --git a/onnxruntime/core/providers/coreml/model/host_utils.mm b/onnxruntime/core/providers/coreml/model/host_utils.mm index 0ae0cf8f0d207..5487ea35388f5 100644 --- a/onnxruntime/core/providers/coreml/model/host_utils.mm +++ b/onnxruntime/core/providers/coreml/model/host_utils.mm @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "core/platform/env.h" #include "core/providers/coreml/model/host_utils.h" #import @@ -31,6 +32,15 @@ int32_t CoreMLVersion() { std::string GetTemporaryFilePath() { // Get temporary directory for user. NSURL* temporary_directory_url = [NSURL fileURLWithPath:NSTemporaryDirectory() isDirectory:YES]; + +#if !defined(NDEBUG) + std::string path_override = Env::Default().GetEnvironmentVar(kOverrideModelOutputDirectoryEnvVar); + if (!path_override.empty()) { + NSString* ns_path_override = [NSString stringWithUTF8String:path_override.c_str()]; + temporary_directory_url = [NSURL fileURLWithPath:ns_path_override isDirectory:YES]; + } +#endif + // Generate a Unique file name to use. NSString* temporary_filename = [[NSProcessInfo processInfo] globallyUniqueString]; diff --git a/onnxruntime/core/providers/coreml/model/model.h b/onnxruntime/core/providers/coreml/model/model.h index b940c4b768aec..e3cd43d786fc3 100644 --- a/onnxruntime/core/providers/coreml/model/model.h +++ b/onnxruntime/core/providers/coreml/model/model.h @@ -35,6 +35,8 @@ using GetOutputTensorMutableRawDataFn = std::function&& model_input_names, + std::vector&& model_output_names, std::unordered_map&& input_output_info, std::unordered_set&& scalar_outputs, std::unordered_set&& int64_outputs, @@ -60,12 +62,11 @@ class Model { // Mutex for exclusive lock to this model object OrtMutex& GetMutex() { return mutex_; } - // Input and output names in the onnx model's order - const std::vector& GetOnnxInputs() const { return onnx_inputs_; } - void SetOnnxInputs(std::vector&& inputs) { onnx_inputs_ = std::move(inputs); } - - const std::vector& GetOnnxOutputs() const { return onnx_outputs_; } - void SetOnnxOutputs(std::vector&& outputs) { onnx_outputs_ = std::move(outputs); } + // Input and output names in the ORT fused node's order. + // Names may have been adjusted from the originals due to CoreML naming rules. + // We do inputs/outputs based on order at the ONNX level so this doesn't matter. 
+ const std::vector& GetOrderedInputs() const { return model_input_names_; } + const std::vector& GetOrderedOutputs() const { return model_output_names_; } const OnnxTensorInfo* TryGetInputOutputInfo(const std::string& name) const { const auto info_it = input_output_info_.find(name); @@ -80,13 +81,13 @@ class Model { private: std::unique_ptr execution_; + std::vector model_input_names_; // input names in the order of the ORT fused node's inputs + std::vector model_output_names_; // output names in the order of the ORT fused node's outputs + std::unordered_map input_output_info_; std::unordered_set scalar_outputs_; std::unordered_set int64_outputs_; - std::vector onnx_inputs_; - std::vector onnx_outputs_; - OrtMutex mutex_; }; diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm index d5cd70bff9479..1434043e064f4 100644 --- a/onnxruntime/core/providers/coreml/model/model.mm +++ b/onnxruntime/core/providers/coreml/model/model.mm @@ -19,6 +19,7 @@ #include "core/common/narrow.h" #include "core/common/span_utils.h" #include "core/graph/onnx_protobuf.h" +#include "core/platform/env.h" #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/coreml_provider_factory.h" #include "core/providers/coreml/model/host_utils.h" @@ -287,6 +288,14 @@ - (void)cleanup { compiled_model_path_ = nil; } +#if !defined(NDEBUG) + std::string path_override = Env::Default().GetEnvironmentVar(util::kOverrideModelOutputDirectoryEnvVar); + if (!path_override.empty()) { + // don't cleanup + coreml_model_path_ = nil; + } +#endif + if (coreml_model_path_ != nil) { error = nil; [[NSFileManager defaultManager] removeItemAtPath:coreml_model_path_ error:&error]; @@ -487,12 +496,16 @@ Status Predict(const std::unordered_map& inputs, } Model::Model(const std::string& path, + std::vector&& model_input_names, + std::vector&& model_output_names, std::unordered_map&& input_output_info, std::unordered_set&& scalar_outputs, std::unordered_set&& int64_outputs, const logging::Logger& logger, uint32_t coreml_flags) : execution_(std::make_unique(path, logger, coreml_flags)), + model_input_names_(std::move(model_input_names)), + model_output_names_(std::move(model_output_names)), input_output_info_(std::move(input_output_info)), scalar_outputs_(std::move(scalar_outputs)), int64_outputs_(std::move(int64_outputs)) { diff --git a/onnxruntime/core/providers/coreml/model/model_stub.cc b/onnxruntime/core/providers/coreml/model/model_stub.cc index 087c9f8c05d5f..c6f2e7401ea1e 100644 --- a/onnxruntime/core/providers/coreml/model/model_stub.cc +++ b/onnxruntime/core/providers/coreml/model/model_stub.cc @@ -9,12 +9,16 @@ namespace coreml { class Execution {}; Model::Model(const std::string& /*path*/, + std::vector&& model_input_names, + std::vector&& model_output_names, std::unordered_map&& input_output_info, std::unordered_set&& scalar_outputs, std::unordered_set&& int64_outputs, const logging::Logger& /*logger*/, uint32_t /*coreml_flags*/) : execution_(std::make_unique()), + model_input_names_(std::move(model_input_names)), + model_output_names_(std::move(model_output_names)), input_output_info_(std::move(input_output_info)), scalar_outputs_(std::move(scalar_outputs)), int64_outputs_(std::move(int64_outputs)) { diff --git a/onnxruntime/core/providers/cpu/tensor/reshape_helper.h b/onnxruntime/core/providers/cpu/tensor/reshape_helper.h index 5961686674424..d7ceda16e61ea 100644 --- a/onnxruntime/core/providers/cpu/tensor/reshape_helper.h +++ 
b/onnxruntime/core/providers/cpu/tensor/reshape_helper.h @@ -37,12 +37,14 @@ class ReshapeHelper { if (unknown_dim != -1) { // calculate unknown dimension ORT_ENFORCE(size != 0 && (input_shape_size % size) == 0, - "The input tensor cannot be reshaped to the requested shape. Input shape:", input_shape, ", requested shape:", TensorShape(requested_shape)); + "The input tensor cannot be reshaped to the requested shape. Input shape:", input_shape, + ", requested shape:", TensorShape(requested_shape)); requested_shape[unknown_dim] = input_shape_size / size; } else { // check if the output shape is valid. ORT_ENFORCE(input_shape_size == size, - "The input tensor cannot be reshaped to the requested shape. Input shape:", input_shape, ", requested shape:", TensorShape(requested_shape)); + "The input tensor cannot be reshaped to the requested shape. Input shape:", input_shape, + ", requested shape:", TensorShape(requested_shape)); } } }; diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index 7d4111e3b9c39..729ad34368453 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -64,17 +64,22 @@ namespace perftest { "\t Refer to onnxruntime_session_options_config_keys.h for valid keys and values. \n" "\t [Example] -C \"session.disable_cpu_ep_fallback|1 ep.context_enable|1\" \n" "\t-i: Specify EP specific runtime options as key value pairs. Different runtime options available are: \n" + "\t [Usage]: -e -i '| |'\n" + "\n" "\t [DML only] [performance_preference]: DML device performance preference, options: 'default', 'minimum_power', 'high_performance', \n" "\t [DML only] [device_filter]: DML device filter, options: 'any', 'gpu', 'npu', \n" "\t [DML only] [disable_metacommands]: Options: 'true', 'false', \n" "\t [DML only] [enable_dynamic_graph_fusion]: Options: 'true', 'false', \n" "\t [DML only] [enable_graph_serialization]: Options: 'true', 'false', \n" + "\n" "\t [OpenVINO only] [device_type]: Overrides the accelerator hardware type and precision with these values at runtime.\n" "\t [OpenVINO only] [device_id]: Selects a particular hardware device for inference.\n" "\t [OpenVINO only] [enable_npu_fast_compile]: Optionally enabled to speeds up the model's compilation on NPU device targets.\n" "\t [OpenVINO only] [num_of_threads]: Overrides the accelerator hardware type and precision with these values at runtime.\n" "\t [OpenVINO only] [cache_dir]: Explicitly specify the path to dump and load the blobs(Model caching) or cl_cache (Kernel Caching) files feature. If blob files are already present, it will be directly loaded.\n" "\t [OpenVINO only] [enable_opencl_throttling]: Enables OpenCL queue throttling for GPU device(Reduces the CPU Utilization while using GPU) \n" + "\t [Example] [For OpenVINO EP] -e openvino -i \"device_type|CPU_FP32 enable_npu_fast_compile|true num_of_threads|5 enable_opencl_throttling|true cache_dir|\"\"\"\n" + "\n" "\t [QNN only] [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/folderpath/libQnnCpu.so'.\n" "\t [QNN only] [profiling_level]: QNN profiling level, options: 'basic', 'detailed', default 'off'.\n" "\t [QNN only] [rpc_control_latency]: QNN rpc control latency. default to 10.\n" @@ -89,9 +94,8 @@ namespace perftest { "\t [QNN only] [htp_arch]: The minimum HTP architecture. The driver will use ops compatible with this architecture. \n" "\t Options are '0', '68', '69', '73', '75'. Defaults to '0' (none). 
\n" "\t [QNN only] [device_id]: The ID of the device to use when setting 'htp_arch'. Defaults to '0' (for single device). \n" - "\t [Usage]: -e -i '| |'\n\n" - "\t [Example] [For OpenVINO EP] -e openvino -i \"device_type|CPU_FP32 enable_npu_fast_compile|true num_of_threads|5 enable_opencl_throttling|true cache_dir|\"\"\"\n" - "\t [Example] [For QNN EP] -e qnn -i \"backend_path|/folderpath/libQnnCpu.so\" \n\n" + "\t [Example] [For QNN EP] -e qnn -i \"backend_path|/folderpath/libQnnCpu.so\" \n" + "\n" "\t [TensorRT only] [trt_max_partition_iterations]: Maximum iterations for TensorRT parser to get capability.\n" "\t [TensorRT only] [trt_min_subgraph_size]: Minimum size of TensorRT subgraphs.\n" "\t [TensorRT only] [trt_max_workspace_size]: Set TensorRT maximum workspace size in byte.\n" @@ -108,20 +112,23 @@ namespace perftest { "\t [TensorRT only] [trt_force_sequential_engine_build]: Force TensorRT engines to be built sequentially.\n" "\t [TensorRT only] [trt_context_memory_sharing_enable]: Enable TensorRT context memory sharing between subgraphs.\n" "\t [TensorRT only] [trt_layer_norm_fp32_fallback]: Force Pow + Reduce ops in layer norm to run in FP32 to avoid overflow.\n" - "\t [Usage]: -e -i '| |'\n\n" - "\t [Example] [For TensorRT EP] -e tensorrt -i 'trt_fp16_enable|true trt_int8_enable|true trt_int8_calibration_table_name|calibration.flatbuffers trt_int8_use_native_calibration_table|false trt_force_sequential_engine_build|false'\n" + "\t [Example] [For TensorRT EP] -e tensorrt -i 'trt_fp16_enable|true trt_int8_enable|true trt_int8_calibration_table_name|calibration.flatbuffers trt_int8_use_native_calibration_table|false trt_force_sequential_engine_build|false'\n" + "\n" "\t [NNAPI only] [NNAPI_FLAG_USE_FP16]: Use fp16 relaxation in NNAPI EP..\n" "\t [NNAPI only] [NNAPI_FLAG_USE_NCHW]: Use the NCHW layout in NNAPI EP.\n" "\t [NNAPI only] [NNAPI_FLAG_CPU_DISABLED]: Prevent NNAPI from using CPU devices.\n" "\t [NNAPI only] [NNAPI_FLAG_CPU_ONLY]: Using CPU only in NNAPI EP.\n" - "\t [Usage]: -e -i ' '\n\n" - "\t [Example] [For NNAPI EP] -e nnapi -i \" NNAPI_FLAG_USE_FP16 NNAPI_FLAG_USE_NCHW NNAPI_FLAG_CPU_DISABLED \"\n" + "\t [Example] [For NNAPI EP] -e nnapi -i \"NNAPI_FLAG_USE_FP16 NNAPI_FLAG_USE_NCHW NNAPI_FLAG_CPU_DISABLED\"\n" + "\n" + "\t [CoreML only] [COREML_FLAG_CREATE_MLPROGRAM]: Create an ML Program model instead of Neural Network.\n" + "\t [Example] [For CoreML EP] -e coreml -i \"COREML_FLAG_CREATE_MLPROGRAM\"\n" + "\n" "\t [SNPE only] [runtime]: SNPE runtime, options: 'CPU', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n" "\t [SNPE only] [priority]: execution priority, options: 'low', 'normal'. \n" "\t [SNPE only] [buffer_type]: options: 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. default: ITENSOR'. \n" "\t [SNPE only] [enable_init_cache]: enable SNPE init caching feature, set to 1 to enabled it. Disabled by default. 
\n" - "\t [Usage]: -e -i '| |' \n\n" - "\t [Example] [For SNPE EP] -e snpe -i \"runtime|CPU priority|low\" \n\n" + "\t [Example] [For SNPE EP] -e snpe -i \"runtime|CPU priority|low\" \n\n" + "\n" "\t-T [Set intra op thread affinities]: Specify intra op thread affinity string\n" "\t [Example]: -T 1,2;3,4;5,6 or -T 1-2;3-4;5-6 \n" "\t\t Use semicolon to separate configuration between threads.\n" diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 1934314b8ce43..9679ca6159464 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -468,7 +468,10 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); nnapi_flags |= NNAPI_FLAG_CPU_ONLY; } else if (key.empty()) { } else { - ORT_THROW("[ERROR] [NNAPI] wrong key type entered. Choose from the following runtime key options that are available for NNAPI. ['NNAPI_FLAG_USE_FP16', 'NNAPI_FLAG_USE_NCHW', 'NNAPI_FLAG_CPU_DISABLED', 'NNAPI_FLAG_CPU_ONLY'] \n"); + ORT_THROW( + "[ERROR] [NNAPI] wrong key type entered. Choose from the following runtime key options " + "that are available for NNAPI. " + "['NNAPI_FLAG_USE_FP16', 'NNAPI_FLAG_USE_NCHW', 'NNAPI_FLAG_CPU_DISABLED', 'NNAPI_FLAG_CPU_ONLY'] \n"); } } Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Nnapi(session_options, nnapi_flags)); @@ -476,10 +479,31 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); ORT_THROW("NNAPI is not supported in this build\n"); #endif } else if (provider_name_ == onnxruntime::kCoreMLExecutionProvider) { +#ifdef __APPLE__ #ifdef USE_COREML - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(session_options, 0)); + uint32_t coreml_flags = 0; + std::string ov_string = performance_test_config.run_config.ep_runtime_config_string; + std::istringstream ss(ov_string); + + std::string key; + while (ss >> key) { + if (key == "COREML_FLAG_CREATE_MLPROGRAM") { + coreml_flags |= COREML_FLAG_CREATE_MLPROGRAM; + std::cout << "Enabling ML Program.\n"; + } else if (key.empty()) { + } else { + ORT_THROW( + "[ERROR] [CoreML] wrong key type entered. Choose from the following runtime key options " + "that are available for CoreML. ['COREML_FLAG_CREATE_MLPROGRAM'] \n"); + } + } + // COREML_FLAG_CREATE_MLPROGRAM + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(session_options, coreml_flags)); +#else + ORT_THROW("CoreML is not supported in this build\n"); +#endif #else - ORT_THROW("COREML is not supported in this build\n"); + ORT_THROW("COREML is not supported on this platform.\n"); #endif } else if (provider_name_ == onnxruntime::kDmlExecutionProvider) { #ifdef USE_DML diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc index 7b6f1b9244be9..94817158017bd 100644 --- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc +++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc @@ -192,5 +192,25 @@ TEST(CoreMLExecutionProviderTest, TestOrtFormatModel) { #endif } +// Test that we fix invalid names in model inputs, initializers and outputs. 
+TEST(CoreMLExecutionProviderTest, TestNameSanitization) {
+  OpTester test("Clip", 11);
+
+  std::vector<int64_t> dims{3, 3};
+  test.AddInput<float>("0", dims,
+                       {-1.0f, 0.0f, 1.0f,
+                        -6.0f, 0.0f, 6.0f,
+                        -5.4f, 2.0f, 6.0f});
+  test.AddInput<float>("1.min", {}, {-5}, true);  // add as initializers
+  test.AddInput<float>("2/max", {}, {5}, true);
+  test.AddOutput<float>("3", dims,
+                        {-1.0f, 0.0f, 1.0f,
+                         -5.0f, 0.0f, 5.0f,
+                         -5.0f, 2.0f, 5.0f});
+
+  // TensorRT does not support Clip opset 11 yet.
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+}
 } // namespace test
 } // namespace onnxruntime
diff --git a/onnxruntime/test/providers/cpu/math/clip_test.cc b/onnxruntime/test/providers/cpu/math/clip_test.cc
index efb46e86d04e4..b5d5f84df950a 100644
--- a/onnxruntime/test/providers/cpu/math/clip_test.cc
+++ b/onnxruntime/test/providers/cpu/math/clip_test.cc
@@ -182,7 +182,7 @@ TEST(MathOpTest, Clip) {
   run_test(true);
 }
 
-// Use clip between [0, 6] as Relu6 (for some EPs, such as NNAPI)
+// Use clip between [0, 6] as Relu6 to test the optimized path in some EPs, such as NNAPI and CoreML
 TEST(MathOpTest, Clip_Relu6) {
   // To test NNAPI EP, we need the min/max to be in initializers
   auto run_test = [](bool min_max_are_initializer) {
@@ -208,6 +208,31 @@ TEST(MathOpTest, Clip_Relu6) {
   run_test(true);
 }
 
+// Use clip between [0, inf] as Relu to test the optimized path in some EPs, such as CoreML
+TEST(MathOpTest, Clip_Relu) {
+  // To test the NNAPI and CoreML EPs, we need the min value to be an initializer
+  auto run_test = [](bool min_max_are_initializer) {
+    OpTester test("Clip", 11);
+
+    std::vector<int64_t> dims{3, 3};
+    test.AddInput<float>("X", dims,
+                         {-1.0f, 0.0f, 1.0f,
+                          -6.0f, 3.5f, 6.0f,
+                          -5.4f, 2.0f, 8.0f});
+    test.AddInput<float>("min", {}, {0.0f}, min_max_are_initializer);
+    test.AddOutput<float>("Y", dims,
+                          {0.0f, 0.0f, 1.0f,
+                           0.0f, 3.5f, 6.0f,
+                           0.0f, 2.0f, 8.0f});
+
+    // TensorRT does not support Clip opset 11 yet.
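+    // (the last argument to Run() is the set of EPs to exclude from this test)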
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+  };
+
+  run_test(false);
+  run_test(true);
+}
+
 // Use clip between [-1, 1] as Relu1 (for some EPs, such as NNAPI)
 TEST(MathOpTest, Clip_Relu1) {
   // To test NNAPI EP, we need the min/max to be in initializers
diff --git a/onnxruntime/test/providers/cpu/math/gemm_test.cc b/onnxruntime/test/providers/cpu/math/gemm_test.cc
index bf089e083d67e..428925e154497 100644
--- a/onnxruntime/test/providers/cpu/math/gemm_test.cc
+++ b/onnxruntime/test/providers/cpu/math/gemm_test.cc
@@ -281,24 +281,31 @@ using GemmOpTypedTestsTypes = ::testing::Types;
 TYPED_TEST_SUITE(GemmOpTypedTests, GemmOpTypedTestsTypes);
 
 TYPED_TEST(GemmOpTypedTests, TestGemmScalarBroadcast) {
-  OpTester test("Gemm");
+  auto run_test = [](bool b_is_initializer, bool c_is_initializer) {
+    OpTester test("Gemm");
 
-  test.AddAttribute("transA", (int64_t)0);
-  test.AddAttribute("transB", (int64_t)0);
-  test.AddAttribute("alpha", 1.0f);
-  test.AddAttribute("beta", 1.0f);
+    test.AddAttribute("transA", (int64_t)0);
+    test.AddAttribute("transB", (int64_t)0);
+    test.AddAttribute("alpha", 1.0f);
+    test.AddAttribute("beta", 1.0f);
 
-  test.AddInput<TypeParam>("A", {2, 4},
-                           {static_cast<TypeParam>(1.0f), static_cast<TypeParam>(2.0f), static_cast<TypeParam>(3.0f), static_cast<TypeParam>(4.0f),
-                            static_cast<TypeParam>(-1.0f), static_cast<TypeParam>(-2.0f), static_cast<TypeParam>(-3.0f), static_cast<TypeParam>(-4.0f)});
-  test.AddInput<TypeParam>("B", {4, 3}, std::vector<TypeParam>(12, static_cast<TypeParam>(1.0f)));
-  test.AddInput<TypeParam>("C", {1}, std::vector<TypeParam>{static_cast<TypeParam>(1.0f)});
-  test.AddOutput<TypeParam>("Y", {2, 3},
-                            {static_cast<TypeParam>(11.0f), static_cast<TypeParam>(11.0f), static_cast<TypeParam>(11.0f),
-                             static_cast<TypeParam>(-9.0f), static_cast<TypeParam>(-9.0f), static_cast<TypeParam>(-9.0f)});
-  test.Config(run_with_tunable_op)
-      .RunWithConfig();
+    test.AddInput<TypeParam>("A", {2, 4},
+                             {static_cast<TypeParam>(1.0f), static_cast<TypeParam>(2.0f), static_cast<TypeParam>(3.0f), static_cast<TypeParam>(4.0f),
+                              static_cast<TypeParam>(-1.0f), static_cast<TypeParam>(-2.0f), static_cast<TypeParam>(-3.0f), static_cast<TypeParam>(-4.0f)});
+    test.AddInput<TypeParam>("B", {4, 3}, std::vector<TypeParam>(12, static_cast<TypeParam>(1.0f)), b_is_initializer);
+    test.AddInput<TypeParam>("C", {1}, std::vector<TypeParam>{static_cast<TypeParam>(1.0f)}, c_is_initializer);
+    test.AddOutput<TypeParam>("Y", {2, 3},
+                              {static_cast<TypeParam>(11.0f), static_cast<TypeParam>(11.0f), static_cast<TypeParam>(11.0f),
+                               static_cast<TypeParam>(-9.0f), static_cast<TypeParam>(-9.0f), static_cast<TypeParam>(-9.0f)});
+    test.Config(run_with_tunable_op)
+        .RunWithConfig();
+  };
+
+  run_test(false, false);
+  // CoreML EP requires weight and bias to be initializers
+  run_test(true, true);
 }
+
 TYPED_TEST(GemmOpTypedTests, TestGemm2DBroadcast_2) {
   OpTester test("Gemm");
diff --git a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc
index ee18cf2cea6cb..cbb4531a50b7c 100644
--- a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc
@@ -75,6 +75,43 @@ TEST(BatchNormTest, PositiveTestCase) {
   input_data_map.insert({"mean", mean});
   input_data_map.insert({"var", var});
 
+  InputShapesMap input_shapes_map;
+  vector<int64_t> input_shape{1, 1, 7, 7};
+  input_shapes_map.insert({"X", input_shape});
+  input_shapes_map.insert({"scale", {1}});
+  input_shapes_map.insert({"B", {1}});
+  input_shapes_map.insert({"mean", {1}});
+  input_shapes_map.insert({"var", {1}});
+
+  auto expected_output = {1.01359f, 0.703983f, 0.641631f, 1.08571f, 0.939167f, 0.762469f, 0.682729f, 0.762401f, 0.787021f,
+                          1.06744f, 0.604378f, 0.957476f, 0.667302f, 0.901764f, 1.07566f, 1.01117f, 0.928324f, 0.897667f,
+                          0.705842f, 0.660885f, 0.977291f, 0.878918f, 0.818345f, 1.06608f, 0.839057f, 1.04796f, 0.621471f,
+                          0.781831f, 0.760527f, 0.835665f, 1.05825f, 0.611442f, 0.781873f, 1.08437f, 0.907454f, 0.926173f,
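+                          // (second half of the expected output for the {1, 1, 7, 7} case above)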
+                          1.03375f, 0.707961f, 0.968646f, 0.621757f, 0.973095f, 0.700301f, 0.916723f, 0.807602f, 0.692598f,
+                          0.621972f, 0.707334f, 0.63723f, 0.63062f};
+  float epsilon = 1e-05f;
+  TestBatchNorm(input_data_map, input_shapes_map, epsilon, expected_output, input_shape);
+}
+
+TEST(BatchNormTest, PositiveTestCase_5D) {
+  // This input was taken from the SpatialBN_1.pb, SpatialBN_1_input.pb and SpatialBN_1_output.pb files.
+  vector<float> X{0.329876f, -0.287158f, -0.411425f, 0.473621f, 0.18156f, -0.170596f, -0.329516f, -0.170733f, -0.121664f, 0.4372f,
+                  -0.485668f, 0.218049f, -0.360263f, 0.107016f, 0.45358f, 0.325056f, 0.15995f, 0.098852f, -0.283453f, -0.373051f,
+                  0.257542f, 0.0614853f, -0.0592363f, 0.434488f, -0.0179583f, 0.398374f, -0.451602f, -0.132009f, -0.174468f,
+                  -0.0247169f, 0.418897f, -0.47159f, -0.131925f, 0.470943f, 0.118357f, 0.155664f, 0.370062f, -0.279229f, 0.240311f,
+                  -0.451034f, 0.249178f, -0.294496f, 0.13683f, -0.0806475f, -0.309849f, -0.450604f, -0.28048f, -0.420197f, -0.433369f};
+  vector<float> scale{0.589433f};
+  vector<float> B{-0.384622f};
+  vector<float> mean{-2.45673f};
+  vector<float> var{1.37998f};
+
+  InputDataMap input_data_map;
+  input_data_map.insert({"X", X});
+  input_data_map.insert({"scale", scale});
+  input_data_map.insert({"B", B});
+  input_data_map.insert({"mean", mean});
+  input_data_map.insert({"var", var});
+
   InputShapesMap input_shapes_map;
   vector<int64_t> input_shape{1, 1, 7, 7, 1};
   input_shapes_map.insert({"X", input_shape});
diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc
index 10f02349a24d5..e32d171e62681 100644
--- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc
@@ -566,8 +566,8 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_pytorch_half_pixe
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kDmlExecutionProvider});
 }
 
-TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_asymmetric) {
-  // To test NNAPI EP, we need the sclaes/sizes to be in initializers
+TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_asymmetric_scales) {
+  // To test CoreML/NNAPI EP, we need the scales/sizes to be in initializers
   auto run_test = [](bool scales_in_initializer) {
     OpTester test("Resize", 13);
     std::vector<float> roi{};
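+    // (scales/sizes must be initializers because NNAPI/CoreML convert the model ahead of time,
+    //  so the values have to be known at model-conversion time rather than at inference time)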