From 590a80cd83e74f93de448d5928663dff3275e6d1 Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Tue, 10 Dec 2024 18:22:27 +0800
Subject: [PATCH] support coreml model cache

---
 .../coreml/coreml_provider_factory.h          |  12 ++
 onnxruntime/core/platform/env.h               |   2 +
 onnxruntime/core/platform/posix/env.cc        |   8 +
 onnxruntime/core/platform/windows/env.cc      |  10 +
 onnxruntime/core/platform/windows/env.h       |   2 +
 .../coreml/builders/model_builder.cc          |  42 +++-
 .../providers/coreml/builders/model_builder.h |   6 +-
 .../core/providers/coreml/coreml_options.cc   |  20 ++
 .../core/providers/coreml/coreml_options.h    |   4 +
 .../core/providers/coreml/model/model.mm      | 198 +++++++++++-------
 onnxruntime/test/perftest/ort_test_session.cc |   4 +-
 11 files changed, 231 insertions(+), 77 deletions(-)

diff --git a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
index d035fd34bd072..12bdcddb5ae2a 100644
--- a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
+++ b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
@@ -61,6 +61,18 @@ static const char* const kCoremlProviderOption_SpecializationStrategy = "Special
 static const char* const kCoremlProviderOption_ProfileComputePlan = "ProfileComputePlan";
 // please refer to https://developer.apple.com/documentation/coreml/mlmodelconfiguration/allowlowprecisionaccumulationongpu
 static const char* const kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU = "AllowLowPrecisionAccumulationOnGPU";
+// Specify the directory in which to cache the converted CoreML model.
+// The CoreML EP converts an ONNX subgraph to a CoreML model and saves it to disk. If this path is
+// not specified, the model is saved to a temporary directory and deleted when the session is
+// closed; otherwise the model is saved to the specified path and the user is responsible for deleting it.
+// The basic logic is:
+//   if (ModelCachePath != nullptr && ModelCachePath/cache_coreml.exists()) {
+//     // load from cache_coreml
+//   } else {
+//     // save to ModelCachePath
+//   }
+// The EP does not detect whether a cached model matches the ONNX subgraph, so clear the cache whenever the model changes.
+static const char* const kCoremlProviderOption_ModelCachePath = "ModelCachePath";

 #ifdef __cplusplus
 extern "C" {
diff --git a/onnxruntime/core/platform/env.h b/onnxruntime/core/platform/env.h
index c42b31e64d129..7dbc3fe82db47 100644
--- a/onnxruntime/core/platform/env.h
+++ b/onnxruntime/core/platform/env.h
@@ -197,6 +197,7 @@ class Env {
 #ifdef _WIN32
   /// \brief Returns true if the directory exists.
   virtual bool FolderExists(const std::wstring& path) const = 0;
+  virtual bool FileExists(const std::wstring& path) const = 0;
   /// \brief Recursively creates the directory, if it doesn't exist.
   virtual common::Status CreateFolder(const std::wstring& path) const = 0;
   // Mainly for use with protobuf library
@@ -206,6 +207,7 @@ class Env {
 #endif
   /// \brief Returns true if the directory exists.
   virtual bool FolderExists(const std::string& path) const = 0;
+  virtual bool FileExists(const std::string& path) const = 0;
   /// \brief Recursively creates the directory, if it doesn't exist.
   virtual common::Status CreateFolder(const std::string& path) const = 0;
   // Recursively deletes the directory and its contents.
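For context, a minimal sketch of how an application could opt into the new cache option. This assumes an ORT build whose CoreML EP accepts string-keyed provider options through `AppendExecutionProvider`; the model file and cache directory below are placeholders, not values from this patch.

```cpp
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "coreml_cache_demo"};
  Ort::SessionOptions session_options;

  // "ModelCachePath" is the key defined by kCoremlProviderOption_ModelCachePath above.
  // On the first run the EP converts the ONNX subgraph and saves the CoreML model under
  // this directory; later runs load the cached model instead of reconverting.
  session_options.AppendExecutionProvider("CoreML", {{"ModelCachePath", "/tmp/coreml_cache"}});

  Ort::Session session{env, "model.onnx", session_options};
  return 0;
}
```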
diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc
index 04cf5ff6a3329..94aadf3df4d7e 100644
--- a/onnxruntime/core/platform/posix/env.cc
+++ b/onnxruntime/core/platform/posix/env.cc
@@ -471,6 +471,14 @@ class PosixEnv : public Env {
     return S_ISDIR(sb.st_mode);
   }

+  bool FileExists(const std::string& path) const override {
+    struct stat sb;
+    if (stat(path.c_str(), &sb)) {
+      return false;
+    }
+    return S_ISREG(sb.st_mode);
+  }
+
   common::Status CreateFolder(const std::string& path) const override {
     size_t pos = 0;
     do {
diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc
index 73319cd9c9b1c..4fccad6dfeb37 100644
--- a/onnxruntime/core/platform/windows/env.cc
+++ b/onnxruntime/core/platform/windows/env.cc
@@ -483,6 +483,16 @@ bool WindowsEnv::FolderExists(const std::string& path) const {
   return (attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY);
 }

+bool WindowsEnv::FileExists(const std::wstring& path) const {
+  DWORD attributes = GetFileAttributesW(path.c_str());
+  // Do not test FILE_ATTRIBUTE_NORMAL here: it is only reported when no other attribute is set,
+  // so it would miss most real files (e.g. ones with FILE_ATTRIBUTE_ARCHIVE).
+  return (attributes != INVALID_FILE_ATTRIBUTES) && !(attributes & FILE_ATTRIBUTE_DIRECTORY);
+}
+
+bool WindowsEnv::FileExists(const std::string& path) const {
+  DWORD attributes = GetFileAttributesA(path.c_str());
+  return (attributes != INVALID_FILE_ATTRIBUTES) && !(attributes & FILE_ATTRIBUTE_DIRECTORY);
+}
+
 common::Status WindowsEnv::CreateFolder(const std::wstring& path) const {
   size_t pos = 0;
   do {
diff --git a/onnxruntime/core/platform/windows/env.h b/onnxruntime/core/platform/windows/env.h
index 395aface1d809..05b92bb6a21eb 100644
--- a/onnxruntime/core/platform/windows/env.h
+++ b/onnxruntime/core/platform/windows/env.h
@@ -68,6 +68,8 @@ class WindowsEnv : public Env {
                                    MappedMemoryPtr& mapped_memory) const override;
   bool FolderExists(const std::wstring& path) const override;
   bool FolderExists(const std::string& path) const override;
+  bool FileExists(const std::wstring& path) const override;
+  bool FileExists(const std::string& path) const override;
   common::Status CreateFolder(const std::wstring& path) const override;
   common::Status CreateFolder(const std::string& path) const override;
   common::Status DeleteFolder(const PathString& path) const override;
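These new `Env` helpers exist because the two cache layouts look different on disk: an ML Program cache is a folder (mlpackage) while a NeuralNetwork cache is a single file. A small illustrative sketch of how a caller can combine them (the helper name `CachedModelExists` is hypothetical, not part of this patch):

```cpp
#include <string>
#include "core/platform/env.h"

// A cached ML Program is a directory; a cached NeuralNetwork model is a plain
// file. Checking both covers either model format.
bool CachedModelExists(const std::string& cache_path) {
  auto& env = onnxruntime::Env::Default();
  return env.FolderExists(cache_path) || env.FileExists(cache_path);
}
```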
diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.cc b/onnxruntime/core/providers/coreml/builders/model_builder.cc
index 6486942199df7..8c98ee0c4097e 100644
--- a/onnxruntime/core/providers/coreml/builders/model_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/model_builder.cc
@@ -410,10 +410,37 @@ ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logge
       coreml_version_(coreml_version),
       coreml_options_(coreml_options),
       create_ml_program_(coreml_options.CreateMLProgram()),
-      model_output_path_(GetModelOutputPath(create_ml_program_)),
       onnx_input_names_(std::move(onnx_input_names)),
       onnx_output_names_(std::move(onnx_output_names)),
       coreml_model_(std::make_unique<CoreML::Specification::Model>()) {
+  if (coreml_options.ModelCachePath().empty()) {
+    model_output_path_ = GetModelOutputPath(create_ml_program_);
+  } else {
+    // Input names in ONNX are unique, so they can serve as the cache key.
+    std::string inputs_collections = std::accumulate(
+        onnx_input_names_.begin(), onnx_input_names_.end(), std::string(),
+        [](const std::string& a, const std::string& b) { return a + "," + b; });
+    std::hash<std::string> hasher;
+    // Each subgraph gets its own cache folder, so hash the concatenated input names.
+    model_output_path_ = std::string(coreml_options.ModelCachePath()) +
+                         "/" + std::to_string(hasher(inputs_collections));
+    if (!coreml_options_.CreateMLProgram()) {
+      ORT_THROW_IF_ERROR(Env::Default().CreateFolder(model_output_path_));
+      model_output_path_ += "/mlmodel";
+    }
+  }
+
+  // GetModelOutputPath(create_ml_program_) always produces a unique path that does not yet exist,
+  // so an existing path must be a cached model. An ML Program is stored as a folder while a
+  // NeuralNetwork model is stored as a file, hence both checks.
+  if (Env::Default().FolderExists(ToPathString(model_output_path_)) ||
+      Env::Default().FileExists(ToPathString(model_output_path_))) {
+    is_model_cached_ = true;
+    LOGS(logger, WARNING) << "Model is already cached in " << model_output_path_
+                          << " and will be reused. If you want to update the model or run into other issues, "
+                          << "clear the cache and retry.";
+    return;
+  }
+
   if (create_ml_program_) {
 #if defined(COREML_ENABLE_MLPROGRAM)
     coreml_model_->set_specificationversion(CoreMLSpecVersion());
@@ -847,6 +874,10 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i

   input_output_info_.emplace(name, OnnxTensorInfo{data_type, shape});

+  if (is_model_cached_) {
+    return Status::OK();
+  }
+
 #if defined(COREML_ENABLE_MLPROGRAM)
   if (create_ml_program_) {
     if (is_input) {
@@ -1056,8 +1087,13 @@ Status ModelBuilder::Build(const GraphViewer& graph_viewer, const logging::Logge
   ModelBuilder builder(graph_viewer, logger, coreml_version, coreml_options,
                        std::move(onnx_input_names), std::move(onnx_output_names));

-  ORT_RETURN_IF_ERROR(builder.CreateModel());
-  ORT_RETURN_IF_ERROR(builder.SaveModel());
+  if (!builder.IsModelCached()) {
+    ORT_RETURN_IF_ERROR(builder.CreateModel());
+    ORT_RETURN_IF_ERROR(builder.SaveModel());
+  } else {
+    ORT_RETURN_IF_ERROR(builder.RegisterModelInputs());
+    ORT_RETURN_IF_ERROR(builder.RegisterModelOutputs());
+  }

   return builder.LoadModel(model);
 }
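The cache key in the constructor above is simply a hash of the comma-joined input names. A standalone sketch of the same derivation (the function name is hypothetical; note that `std::hash` is implementation-defined, so the key is stable within one build but not guaranteed to be identical across platforms or standard-library versions):

```cpp
#include <functional>
#include <numeric>
#include <string>
#include <vector>

// Mirrors the ModelBuilder constructor: join the (unique) ONNX input names with
// commas, then hash the result to get a per-subgraph cache folder name.
std::string SubgraphCacheKey(const std::vector<std::string>& onnx_input_names) {
  std::string joined = std::accumulate(
      onnx_input_names.begin(), onnx_input_names.end(), std::string(),
      [](const std::string& a, const std::string& b) { return a + "," + b; });
  return std::to_string(std::hash<std::string>{}(joined));
}
```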
diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.h b/onnxruntime/core/providers/coreml/builders/model_builder.h
index e19597cf0dc2e..28c7dc42da581 100644
--- a/onnxruntime/core/providers/coreml/builders/model_builder.h
+++ b/onnxruntime/core/providers/coreml/builders/model_builder.h
@@ -54,6 +54,7 @@ class ModelBuilder {
   // We only support CoreML 3 and later so the spec version is always version + 1.
   int32_t CoreMLVersion() const { return coreml_version_; }
   int32_t CoreMLSpecVersion() const { return coreml_version_ + 1; }
+  bool IsModelCached() const { return is_model_cached_; }

   // Returns true if we are creating an ML Program
   bool CreateMLProgram() const {
@@ -218,8 +219,9 @@ class ModelBuilder {
   const logging::Logger& logger_;
   const int32_t coreml_version_;
   CoreMLOptions coreml_options_;
-  const bool create_ml_program_;         // ML Program (CoreML5, iOS 15+, macOS 12+) or NeuralNetwork (old)
-  const std::string model_output_path_;  // create_ml_program_ ? dir for mlpackage : filename for mlmodel
+  const bool create_ml_program_;   // ML Program (CoreML5, iOS 15+, macOS 12+) or NeuralNetwork (old)
+  std::string model_output_path_;  // create_ml_program_ ? dir for mlpackage : filename for mlmodel
+  bool is_model_cached_{false};

   std::vector<std::string> onnx_input_names_;
   std::vector<std::string> onnx_output_names_;
diff --git a/onnxruntime/core/providers/coreml/coreml_options.cc b/onnxruntime/core/providers/coreml/coreml_options.cc
index 4ec780208e528..5babd7633cd88 100644
--- a/onnxruntime/core/providers/coreml/coreml_options.cc
+++ b/onnxruntime/core/providers/coreml/coreml_options.cc
@@ -5,6 +5,7 @@
 #include "core/providers/coreml/coreml_provider_factory.h"  // defines flags
 #include "core/providers/coreml/model/host_utils.h"
 #include "core/providers/coreml/builders/helper.h"
+#include "core/platform/env.h"

 namespace onnxruntime {

@@ -71,6 +72,7 @@ void CoreMLOptions::ValidateAndParseProviderOption(const ProviderOptions& option
       kCoremlProviderOption_SpecializationStrategy,
       kCoremlProviderOption_ProfileComputePlan,
       kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU,
+      kCoremlProviderOption_ModelCachePath,
   };
   // Validate the options
   for (const auto& option : options) {
@@ -103,7 +105,25 @@ void CoreMLOptions::ValidateAndParseProviderOption(const ProviderOptions& option
       profile_compute_plan_ = option.second == "1";
     } else if (kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU == option.first) {
       allow_low_precision_accumulation_on_gpu_ = option.second == "1";
+    } else if (kCoremlProviderOption_ModelCachePath == option.first) {
+      model_cache_path_ = option.second;
     }
   }

+  // Qualify the cache path with RequireStaticShape and ModelFormat so that models converted under different options do not collide.
+  if (model_cache_path_.size()) {
+    if (require_static_shape_) {
+      model_cache_path_ += "/static_shape";
+    } else {
+      model_cache_path_ += "/dynamic_shape";
+    }
+
+    if (create_mlprogram_) {
+      model_cache_path_ += "/mlpackage";
+    } else {
+      model_cache_path_ += "/mlnnmodel";
+    }
+    ORT_THROW_IF_ERROR(Env::Default().CreateFolder(model_cache_path_));
+  }
 }
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/coreml/coreml_options.h b/onnxruntime/core/providers/coreml/coreml_options.h
index fd05c96927bd1..1ec4294492552 100644
--- a/onnxruntime/core/providers/coreml/coreml_options.h
+++ b/onnxruntime/core/providers/coreml/coreml_options.h
@@ -17,6 +17,8 @@ class CoreMLOptions {
   std::string strategy_;
   bool profile_compute_plan_{false};
   bool allow_low_precision_accumulation_on_gpu_{false};
+  // Directory in which to store the converted CoreML model; empty means no caching.
+  std::string model_cache_path_;

  public:
   explicit CoreMLOptions(uint32_t coreml_flags);
@@ -32,6 +34,8 @@ class CoreMLOptions {
   bool UseStrategy(std::string_view strategy) const { return strategy_ == strategy; }
   bool ProfileComputePlan() const { return profile_compute_plan_ && create_mlprogram_; }

+  std::string_view ModelCachePath() const { return model_cache_path_; }
+
 private:
  void ValidateAndParseProviderOption(const ProviderOptions& options);
 };
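Putting the two pieces together, the on-disk location of a cached model is the user-supplied root qualified by the shape mode and model format, plus the per-subgraph hash from the ModelBuilder constructor. A sketch of the composition (the helper name and example path are illustrative):

```cpp
#include <string>

// Mirrors CoreMLOptions plus ModelBuilder: root -> shape mode -> model format -> subgraph hash.
std::string ComposeCacheDir(const std::string& root, bool require_static_shape,
                            bool create_mlprogram, const std::string& subgraph_hash) {
  std::string path = root;
  path += require_static_shape ? "/static_shape" : "/dynamic_shape";
  path += create_mlprogram ? "/mlpackage" : "/mlnnmodel";
  // e.g. "/tmp/coreml_cache/static_shape/mlpackage/1234567890"
  return path + "/" + subgraph_hash;
}
```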
diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm
index 755dbfbd6e68c..23de66073c928 100644
--- a/onnxruntime/core/providers/coreml/model/model.mm
+++ b/onnxruntime/core/providers/coreml/model/model.mm
@@ -301,53 +301,116 @@ Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array,
   return Status::OK();
 }

-// since __clang_major__ >= 15, MLComputePlan is introduced in <CoreML/CoreML.h>
-// We are actually ensure the MacOS/IOS version and Xcode version is greater than `macOS 14.4, iOS 17.4`.
-// The macro API_AVAILABLE should also be fine.
+// MLComputePlan is available since macOS 14.4 / iOS 17.4; the API_AVAILABLE attribute below enforces this.
 // Otherwise, the compiler will complain `MLComputePlan` is not defined.
 // we define __clang_analyzer__ here is for bypass static analysis
+API_AVAILABLE(macos(14.4), ios(17.4), tvos(17.4), watchos(10.4))
+void ProfileBlock(MLComputePlan* _Nullable computePlan, MLModelStructureProgramBlock* block) {
+  for (MLModelStructureProgramOperation* operation in block.operations) {
+    for (size_t i = 0; i < operation.blocks.count; ++i) {
+      ProfileBlock(computePlan, operation.blocks[i]);
+    }
+    // Get the compute device usage for the operation.
+    MLComputePlanDeviceUsage* computeDeviceUsage = [computePlan computeDeviceUsageForMLProgramOperation:operation];
+    id<MLComputeDeviceProtocol> preferredDevice = computeDeviceUsage.preferredComputeDevice;
+    // Get the estimated cost of executing the operation.
+    MLComputePlanCost* estimatedCost = [computePlan estimatedCostOfMLProgramOperation:operation];
+    if (![operation.operatorName isEqualToString:@"const"]) {
+      NSLog(@"Operation: %@, Device Usage: %@, Estimated Cost: %f", operation.operatorName, preferredDevice, estimatedCost.weight);
+    }
+  }
+}
+
+// Same availability requirements as ProfileBlock above.
+API_AVAILABLE(macos(14.4), ios(17.4), tvos(17.4), watchos(10.4))
 void ProfileComputePlan(NSURL* compileUrl, MLModelConfiguration* config) {
-#if defined(__APPLE__) && defined(__clang__) && __clang_major__ >= 15 && !defined(__clang_analyzer__)
-  if (@available(macOS 14.4, iOS 17.4, *)) {
-    [MLComputePlan loadContentsOfURL:compileUrl
-                       configuration:config
-                   completionHandler:^(MLComputePlan* _Nullable computePlan, NSError* _Nullable error) {
-                     if (!computePlan) {
-                       NSLog(@"Error loading compute plan: %@", error);
-                       // Handle error.
-                       return;
-                     }
-                     MLModelStructureProgram* program = computePlan.modelStructure.program;
-                     if (!program) {
-                       NSLog(@"Error loading program from compute plan., this is not a mlprogram model");
-                       return;
-                     }
-
-                     MLModelStructureProgramFunction* mainFunction = program.functions[@"main"];
-                     if (!mainFunction) {
-                       NSLog(@"Error loading main function from program");
-                       return;
-                     }
-
-                     NSArray<MLModelStructureProgramOperation*>* operations = mainFunction.block.operations;
-                     NSLog(@"Number of operations, 'const' node is included. : %lu", operations.count);
-                     for (MLModelStructureProgramOperation* operation in operations) {
-                       // Get the compute device usage for the operation.
-                       MLComputePlanDeviceUsage* computeDeviceUsage = [computePlan computeDeviceUsageForMLProgramOperation:operation];
-                       id<MLComputeDeviceProtocol> preferredDevice = computeDeviceUsage.preferredComputeDevice;
-                       // Get the estimated cost of executing the operation.
-                       MLComputePlanCost* estimatedCost = [computePlan estimatedCostOfMLProgramOperation:operation];
-                       if (![operation.operatorName isEqualToString:@"const"]) {
-                         NSLog(@"Operation: %@, Device Usage: %@, Estimated Cost: %f", operation.operatorName, preferredDevice, estimatedCost.weight);
-                       }
-                     }
+#if !defined(__clang_analyzer__)
+  dispatch_semaphore_t fd_sema = dispatch_semaphore_create(0);
+  [MLComputePlan loadContentsOfURL:compileUrl
+                     configuration:config
+                 completionHandler:^(MLComputePlan* _Nullable computePlan, NSError* _Nullable error) {
+                   if (!computePlan) {
+                     NSLog(@"Error loading compute plan: %@", error);
+                     // Signal before the early return so the wait below does not hit the timeout.
+                     dispatch_semaphore_signal(fd_sema);
+                     return;
+                   }
+                   MLModelStructureProgram* program = computePlan.modelStructure.program;
+                   if (!program) {
+                     NSLog(@"Error loading program from compute plan; this is not an mlprogram model");
+                     dispatch_semaphore_signal(fd_sema);
+                     return;
+                   }
+
+                   [computePlan.modelStructure.program.functions enumerateKeysAndObjectsUsingBlock:^(NSString* function_name,
+                                                                                                     MLModelStructureProgramFunction* function,
+                                                                                                     BOOL* _Nonnull __unused stop) {
+                     NSLog(@"profile function : %@", function_name);
+                     ProfileBlock(computePlan, function.block);
+                     dispatch_semaphore_signal(fd_sema);
                    }];
+                 }];
+  long status = dispatch_semaphore_wait(fd_sema, dispatch_time(DISPATCH_TIME_NOW, (int64_t)(5 * 60 * NSEC_PER_SEC)));
+  if (status != 0) {
+    NSLog(@"profile function : timeout");
+  }
+#endif
+}
+
+API_AVAILABLE(macos(15.0), ios(18.0), tvos(18.0))
+void ConfigureOptimizationHints(MLModelConfiguration* config, const CoreMLOptions& coreml_options) {
+#if !defined(__clang_analyzer__)
+  MLOptimizationHints* optimizationHints = [[MLOptimizationHints alloc] init];
+  if (coreml_options.UseStrategy("FastPrediction")) {
+    optimizationHints.specializationStrategy = MLSpecializationStrategyFastPrediction;
+    config.optimizationHints = optimizationHints;
+  } else if (coreml_options.UseStrategy("Default")) {
+    optimizationHints.specializationStrategy = MLSpecializationStrategyDefault;
+    config.optimizationHints = optimizationHints;
   } else {
-    NSLog(@"iOS 17.4+/macOS 14.4+ or later is required to use the compute plan API");
+    // not set
   }
 #endif
 }

+Status CompileOrReadCachedModel(NSURL* modelUrl, const CoreMLOptions& coreml_options,
+                                NSMutableString* compiled_model_path) {
+  NSURL* cached_model_base_url = modelUrl;
+  if (!coreml_options.CreateMLProgram()) {
+    cached_model_base_url = [cached_model_base_url URLByDeletingLastPathComponent];
+  }
+  NSURL* cached_model_url = [cached_model_base_url URLByAppendingPathComponent:@"compiled_model.mlmodelc"];
+
+  // If a cached compiled model already exists, return its path directly.
+  NSError* error = nil;
+  if ([[NSFileManager defaultManager] fileExistsAtPath:[cached_model_url path]]) {
+    [compiled_model_path appendString:[cached_model_url path]];
+    return Status::OK();
+  }
+
+  // TODO: Update this to the version with a completion handler, as the API used here is deprecated.
+  // https://developer.apple.com/documentation/coreml/mlmodel/3929553-compilemodelaturl
+  // As we call loadModel during EP Compile there shouldn't be an issue letting the actual compile run in the
+  // background. We will have to check for completion in `predict` and block until it is done.
+  NSURL* compiled_model_url = [MLModel compileModelAtURL:modelUrl error:&error];
+  if (error != nil) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Error compiling model: ",
+                           [[error localizedDescription] UTF8String]);
+  }
+
+  if (coreml_options.ModelCachePath().empty()) {
+    [compiled_model_path appendString:[compiled_model_url path]];
+    return Status::OK();
+  }
+
+  // Move the compiled model into the cache since the user has set a cache path.
+  if (![[NSFileManager defaultManager] moveItemAtURL:compiled_model_url toURL:cached_model_url error:&error]) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Error moving compiled model to cache path: ",
+                           [[cached_model_url path] UTF8String], ", reason: ", [[error localizedDescription] UTF8String]);
+  }
+  [compiled_model_path appendString:[cached_model_url path]];
+  return Status::OK();
+}
+
 // Internal Execution class
 // This class is part of the model class and handles the calls into CoreML. Specifically, it performs
 // 1. Compile the model by given path for execution
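The rewritten ProfileComputePlan above blocks the calling thread on a dispatch semaphore until the asynchronous completion handler finishes, with a five-minute timeout. For readers more at home in C++, a rough analogue of that wait-with-timeout pattern (illustrative only, not code from this patch):

```cpp
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

int main() {
  std::mutex m;
  std::condition_variable cv;
  bool done = false;

  // Stand-in for the asynchronous CoreML completion handler.
  std::thread worker([&] {
    // ... profile the compute plan ...
    std::lock_guard<std::mutex> lock(m);
    done = true;
    cv.notify_one();  // plays the role of dispatch_semaphore_signal
  });

  // Equivalent of dispatch_semaphore_wait with a 5-minute timeout.
  std::unique_lock<std::mutex> lock(m);
  if (!cv.wait_for(lock, std::chrono::minutes(5), [&] { return done; })) {
    std::puts("profile function : timeout");
  }
  lock.unlock();
  worker.join();
  return 0;
}
```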
@@ -366,7 +429,7 @@ Status Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
  private:
   void cleanup();
   NSString* coreml_model_path_{nil};
-  NSString* compiled_model_path_{nil};
+  NSURL* compiled_model_url_{nil};
   const logging::Logger& logger_;
   CoreMLOptions coreml_options_;
   MLModel* model_{nil};
@@ -387,14 +450,18 @@ Status Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
 }

 void Execution::cleanup() {
+  // Keep the compiled model on disk if the user has set a cache path.
+  if (coreml_options_.ModelCachePath().size()) {
+    return;
+  }
+
+  NSString* compiled_model_path = [compiled_model_url_ path];
   NSError* error = nil;
-  if (compiled_model_path_ != nil) {
-    [[NSFileManager defaultManager] removeItemAtPath:compiled_model_path_ error:&error];
+  if (compiled_model_path != nil) {
+    [[NSFileManager defaultManager] removeItemAtPath:compiled_model_path error:&error];
     if (error != nil) {
-      LOGS(logger_, ERROR) << "Failed cleaning up the compiled model: " << [compiled_model_path_ UTF8String]
+      LOGS(logger_, ERROR) << "Failed cleaning up the compiled model: " << [compiled_model_path UTF8String]
                            << ", error message: " << [[error localizedDescription] UTF8String];
     }
-    compiled_model_path_ = nil;
   }

 #if !defined(NDEBUG)
@@ -430,17 +497,10 @@ Status Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
     return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create model URL from path");
   }

-  // TODO: Update this to version with callback handler as the API used here is deprecated.
-  // https://developer.apple.com/documentation/coreml/mlmodel/3929553-compilemodelaturl
-  // As we call loadModel during EP Compile there shouldn't be an issue letting the actual compile run in the
-  // background. We will have to check for completion in `predict` and block until it is done.
-  NSURL* compileUrl = [MLModel compileModelAtURL:modelUrl error:&error];
-  if (error != nil) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Error compiling model: ",
-                           [[error localizedDescription] UTF8String]);
-  }
-
-  compiled_model_path_ = [compileUrl path];
+  NSMutableString* compiled_model_path = [[NSMutableString alloc] init];
+  ORT_RETURN_IF_ERROR(CompileOrReadCachedModel(
+      [NSURL fileURLWithPath:coreml_model_path_], coreml_options_, compiled_model_path));
+  compiled_model_url_ = [NSURL fileURLWithPath:compiled_model_path];

   MLModelConfiguration* config = [[MLModelConfiguration alloc] init];
   uint32_t coreml_compute_unit = coreml_options_.ComputeUnits();
@@ -458,27 +518,23 @@ Status Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
     config.allowLowPrecisionAccumulationOnGPU = YES;
   }

-// Set the specialization strategy to FastPrediction for macOS 10.15+
-// since __clang_major__ >= 15, optimizationHints is introduced in <CoreML/CoreML.h>
-// Same as above comments for why we are checking __clang_major__.
-// we define __clang_analyzer__ here is for bypass static analysis
-#if defined(__APPLE__) && defined(__clang__) && __clang_major__ >= 15 && !defined(__clang_analyzer__)
-  if (HAS_COREML8_OR_LATER) {
-    MLOptimizationHints* optimizationHints = [[MLOptimizationHints alloc] init];
-    if (coreml_options_.UseStrategy("FastPrediction")) {
-      optimizationHints.specializationStrategy = MLSpecializationStrategyFastPrediction;
-      config.optimizationHints = optimizationHints;
-    } else if (coreml_options_.UseStrategy("Default")) {
-      optimizationHints.specializationStrategy = MLSpecializationStrategyDefault;
-      config.optimizationHints = optimizationHints;
-    }
+  // Set the specialization strategy via optimization hints where available.
+  // ConfigureOptimizationHints handles the __clang_analyzer__ bypass internally.
+  if (@available(macOS 15.0, iOS 18.0, *)) {
+    ConfigureOptimizationHints(config, coreml_options_);
+  } else {
+    LOGS(logger_, WARNING) << "macOS 15.0+/iOS 18.0+ is required to set optimization hints";
   }
-#endif
+
   if (coreml_options_.ProfileComputePlan()) {
-    ProfileComputePlan(compileUrl, config);
+    if (@available(macOS 14.4, iOS 17.4, *)) {
+      ProfileComputePlan(compiled_model_url_, config);
+    } else {
+      LOGS(logger_, WARNING) << "iOS 17.4+/macOS 14.4+ or later is required to use the compute plan API";
+    }
   }

-  model_ = [MLModel modelWithContentsOfURL:compileUrl configuration:config error:&error];
+  model_ = [MLModel modelWithContentsOfURL:compiled_model_url_ configuration:config error:&error];

   if (error != nil || model_ == nil) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create MLModel",
diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc
index a96028ed3903e..0a09595d67252 100644
--- a/onnxruntime/test/perftest/ort_test_session.cc
+++ b/onnxruntime/test/perftest/ort_test_session.cc
@@ -349,7 +349,8 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)");
         kCoremlProviderOption_EnableOnSubgraphs,
         kCoremlProviderOption_SpecializationStrategy,
         kCoremlProviderOption_ProfileComputePlan,
-        kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU};
+        kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU,
+        kCoremlProviderOption_ModelCachePath};
     ParseSessionConfigs(ov_string, provider_options, available_keys);

     std::unordered_map<std::string, std::string> available_options = {
@@ -373,6 +374,7 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)");
                  (provider_option.second == "0" || provider_option.second == "1")) {
       } else if (provider_option.first == kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU &&
                  (provider_option.second == "0" || provider_option.second == "1")) {
+      } else if (provider_option.first == kCoremlProviderOption_ModelCachePath) {
       } else {
         ORT_THROW("Invalid value for option ", provider_option.first, ": ", provider_option.second);
       }
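Finally, since the header comment states that the EP never validates a cached model against the current ONNX subgraph, cache invalidation is the application's job. A minimal sketch of clearing the cache root whenever the source model changes (the helper name and path handling are illustrative, not part of this patch):

```cpp
#include <filesystem>

// Remove the entire cache root; the EP reconverts the model and repopulates the
// cache on the next session creation.
void ClearCoreMLCache(const std::filesystem::path& cache_root) {
  std::error_code ec;
  std::filesystem::remove_all(cache_root, ec);  // tolerate a missing directory
}
```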