Skip to content

Commit

Permalink
[CoreML] more performance flags (microsoft#22975)
Browse files Browse the repository at this point in the history
### Description
Refactor Unsqueeze's implementation.
Add more flags to boost performance.
Add a profiling flag.


### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->

---------

Co-authored-by: jicwen <[email protected]>
Co-authored-by: wejoncy <[email protected]>
Co-authored-by: Scott McKay <[email protected]>
  • Loading branch information
4 people authored and tarekziade committed Jan 10, 2025
1 parent d585d14 commit b834143
Show file tree
Hide file tree
Showing 13 changed files with 173 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,20 @@ enum COREMLFlags {
// and SessionOptionsAppendExecutionProvider (C API). For the old API, use COREMLFlags instead.
static const char* const kCoremlProviderOption_MLComputeUnits = "MLComputeUnits";
// Model format to create. Recognized values are "MLProgram" and "NeuralNetwork".
static const char* const kCoremlProviderOption_ModelFormat = "ModelFormat";
// Same as COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES. Set to "1" to enable; any other value disables it.
static const char* const kCoremlProviderOption_RequireStaticInputShapes = "RequireStaticInputShapes";
// Set to "1" to enable; any other value disables it.
static const char* const kCoremlProviderOption_EnableOnSubgraphs = "EnableOnSubgraphs";
// Provided by https://developer.apple.com/documentation/coreml/mloptimizationhints-swift.struct/specializationstrategy-swift.property
// Core ML segments the model’s compute graph and specializes each segment for the target compute device.
// This process can affect the model loading time and the prediction latency.
// Use this option to tailor the specialization strategy for your model.
// Valid values are "Default" and "FastPrediction"; any other value is rejected when the options are parsed.
static const char* const kCoremlProviderOption_SpecializationStrategy = "SpecializationStrategy";
// Profile the Core ML MLComputePlan.
// This logs the hardware each operator is dispatched to and the estimated execution time.
// Intended for developer usage, but it provides useful diagnostic information if performance is not as expected.
// Set to "1" to enable. It only takes effect when the ML Program model format is used.
static const char* const kCoremlProviderOption_ProfileComputePlan = "ProfileComputePlan";
// Please refer to https://developer.apple.com/documentation/coreml/mlmodelconfiguration/allowlowprecisionaccumulationongpu
// Set to "1" to enable; any other value disables it.
static const char* const kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU = "AllowLowPrecisionAccumulationOnGPU";

#ifdef __cplusplus
extern "C" {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ bool BatchNormalizationOpBuilder::IsOpSupportedImpl(const Node& node, const OpBu
return false;
}

#if defined(TARGET_OS_IOS) && defined(TARGET_CPU_X86_64)
#if defined(TARGET_OS_IOS) && defined(TARGET_CPU_X86_64) && TARGET_OS_IOS && TARGET_CPU_X86_64
// To Pass IOS pipeline https://dev.azure.com/onnxruntime/onnxruntime/_build?definitionId=134&_a=summary
auto input_dtype = input_defs[0]->TypeAsProto()->tensor_type().elem_type();
if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 && input_params.coreml_version < 7) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,8 @@ bool ReductionOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInpu
return false;
}

#if defined(TARGET_OS_IOS) && defined(TARGET_CPU_X86_64)
// to pass https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=1563483&view=logs&j=f7cc61a9-cc70-56e7-b06c-4668ca17e426
// ReductionOpTest.ReduceSum_half_bert
#if defined(TARGET_OS_IOS) && defined(TARGET_CPU_X86_64) && TARGET_OS_IOS && TARGET_CPU_X86_64
// skip ReductionOpTest.ReduceSum_half_bert because reduce_sum will output all zeros
int32_t input_type;
GetType(*input_defs[0], input_type, logger);
if (node.OpType() == "ReduceSum" && input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
#include "core/optimizer/initializer.h"
#include "core/providers/cpu/tensor/unsqueeze.h"

#ifdef __APPLE__
#include <TargetConditionals.h>
#endif

namespace onnxruntime {
namespace coreml {

Expand Down Expand Up @@ -54,32 +58,50 @@ void SqueezeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const
}
}

#if defined(COREML_ENABLE_MLPROGRAM)
// Emits an ML Program `reshape` operation that produces the Unsqueeze result for `node`.
//
// The output shape is computed here from the node's input shape and axes and handed to
// `reshape` as a constant, so the operation itself does not need to understand the axes.
//
// model_builder: builder that owns the operation and the "shape" constant created here.
// node:          the Unsqueeze node being converted; its first input is the tensor to reshape.
// logger:        forwarded to GetShape when reading the input shape.
void HandleX86ArchUnsqueezeScalarInput(ModelBuilder& model_builder,
                                       const Node& node, const logging::Logger& logger) {
  const auto& inputs = node.InputDefs();

  // Axes at which new dimensions are inserted.
  TensorShapeVector unsqueeze_axes;
  GetAxes(model_builder, node, unsqueeze_axes);

  // Dimensions of the tensor being unsqueezed.
  std::vector<int64_t> in_dims;
  GetShape(*inputs[0], in_dims, logger);

  auto reshape_op = model_builder.CreateOperation(node, "reshape");
  AddOperationInput(*reshape_op, "x", inputs[0]->Name());

  // Resolve the final shape now and pass it to `reshape` as a constant input.
  TensorShapeVector target_shape = UnsqueezeBase::ComputeOutputShape(TensorShape(in_dims), unsqueeze_axes);
  AddOperationInput(*reshape_op, "shape",
                    model_builder.AddConstant(reshape_op->type(), "shape", AsSpan(target_shape)));

  AddOperationOutput(*reshape_op, *node.OutputDefs()[0]);
  model_builder.AddOperation(std::move(reshape_op));
}
#endif

Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
const Node& node,
[[maybe_unused]] const logging::Logger& logger) const {
std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);
const auto& input_defs(node.InputDefs());
auto* coreml_squeeze = layer->mutable_squeeze();
TensorShapeVector axes;
GetAxes(model_builder, node, axes);
std::vector<int64_t> input_shape;
GetShape(*input_defs[0], input_shape, logger);
#if defined(COREML_ENABLE_MLPROGRAM)
const auto& input_defs(node.InputDefs());
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;

std::string_view coreml_op_type = node.OpType() == "Squeeze" ? "squeeze" : "reshape";
#if defined(TARGET_CPU_X86_64) && TARGET_CPU_X86_64
// expand_dims has limited requirements for static shape, however, X86_64 has a bug that it can't handle scalar input
if (node.OpType() == "Unsqueeze" && input_defs[0]->Shape()->dim_size() < 2) {
HandleX86ArchUnsqueezeScalarInput(model_builder, node, logger);
return Status::OK();
}
#endif
std::string_view coreml_op_type = node.OpType() == "Squeeze" ? "squeeze" : "expand_dims";
std::unique_ptr<Operation> op = model_builder.CreateOperation(node, coreml_op_type);
AddOperationInput(*op, "x", input_defs[0]->Name());

if (coreml_op_type == "squeeze") {
if (!axes.empty()) {
// coreml squeeze op does support negative axes
AddOperationInput(*op, "axes", model_builder.AddConstant(op->type(), "axes", AsSpan(axes)));
}
} else {
TensorShapeVector output_shape = UnsqueezeBase::ComputeOutputShape(TensorShape(input_shape), axes);
AddOperationInput(*op, "shape", model_builder.AddConstant(op->type(), "shape", AsSpan(output_shape)));
if (!axes.empty()) {
// coreml supports negative axes
AddOperationInput(*op, "axes", model_builder.AddConstant(op->type(), "axes", AsSpan(axes)));
}
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
Expand Down
6 changes: 3 additions & 3 deletions onnxruntime/core/providers/coreml/builders/model_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,7 @@ ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logge
: graph_viewer_(graph_viewer),
logger_(logger),
coreml_version_(coreml_version),
coreml_compute_unit_(coreml_options.ComputeUnits()),
coreml_options_(coreml_options),
create_ml_program_(coreml_options.CreateMLProgram()),
model_output_path_(GetModelOutputPath(create_ml_program_)),
onnx_input_names_(std::move(onnx_input_names)),
Expand Down Expand Up @@ -989,7 +989,7 @@ Status ModelBuilder::LoadModel(std::unique_ptr<Model>& model) {
get_sanitized_io_info(std::move(input_output_info_)),
std::move(scalar_outputs_),
std::move(int64_outputs_),
logger_, coreml_compute_unit_);
logger_, coreml_options_);
} else
#endif
{
Expand All @@ -999,7 +999,7 @@ Status ModelBuilder::LoadModel(std::unique_ptr<Model>& model) {
std::move(input_output_info_),
std::move(scalar_outputs_),
std::move(int64_outputs_),
logger_, coreml_compute_unit_);
logger_, coreml_options_);
}

return model->LoadModel(); // load using CoreML API, including compilation
Expand Down
5 changes: 2 additions & 3 deletions onnxruntime/core/providers/coreml/builders/model_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "core/graph/graph_viewer.h"
#include "core/providers/coreml/builders/coreml_spec.h"
#include "core/providers/coreml/model/model.h"
#include "core/providers/coreml/coreml_options.h"

#if defined(COREML_ENABLE_MLPROGRAM)
// coremltools classes
Expand All @@ -22,8 +23,6 @@ class StorageWriter;
#endif

namespace onnxruntime {
class CoreMLOptions;

namespace coreml {

class IOpBuilder;
Expand Down Expand Up @@ -218,7 +217,7 @@ class ModelBuilder {
const GraphViewer& graph_viewer_;
const logging::Logger& logger_;
const int32_t coreml_version_;
const uint32_t coreml_compute_unit_;
CoreMLOptions coreml_options_;
const bool create_ml_program_; // ML Program (CoreML5, iOS 15+, macOS 12+) or NeuralNetwork (old)
const std::string model_output_path_; // create_ml_program_ ? dir for mlpackage : filename for mlmodel

Expand Down
15 changes: 14 additions & 1 deletion onnxruntime/core/providers/coreml/coreml_options.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,14 @@ void CoreMLOptions::ValidateAndParseProviderOption(const ProviderOptions& option
{"MLProgram", COREML_FLAG_CREATE_MLPROGRAM},
{"NeuralNetwork", COREML_FLAG_USE_NONE},
};
std::unordered_set<std::string> valid_options = {
const std::unordered_set<std::string_view> valid_options = {
kCoremlProviderOption_MLComputeUnits,
kCoremlProviderOption_ModelFormat,
kCoremlProviderOption_RequireStaticInputShapes,
kCoremlProviderOption_EnableOnSubgraphs,
kCoremlProviderOption_SpecializationStrategy,
kCoremlProviderOption_ProfileComputePlan,
kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU,
};
// Validate the options
for (const auto& option : options) {
Expand All @@ -90,6 +93,16 @@ void CoreMLOptions::ValidateAndParseProviderOption(const ProviderOptions& option
require_static_shape_ = option.second == "1";
} else if (kCoremlProviderOption_EnableOnSubgraphs == option.first) {
enable_on_subgraph_ = option.second == "1";
} else if (kCoremlProviderOption_SpecializationStrategy == option.first) {
if (option.second != "Default" && option.second != "FastPrediction") {
ORT_THROW("Invalid value for option ", option.first, ": ", option.second,
". Valid values are Default and FastPrediction.");
}
strategy_ = option.second;
} else if (kCoremlProviderOption_ProfileComputePlan == option.first) {
profile_compute_plan_ = option.second == "1";
} else if (kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU == option.first) {
allow_low_precision_accumulation_on_gpu_ = option.second == "1";
}
}
}
Expand Down
6 changes: 6 additions & 0 deletions onnxruntime/core/providers/coreml/coreml_options.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ class CoreMLOptions {
bool create_mlprogram_{false};
bool enable_on_subgraph_{false};
uint32_t compute_units_{0};
std::string strategy_;
bool profile_compute_plan_{false};
bool allow_low_precision_accumulation_on_gpu_{false};

public:
explicit CoreMLOptions(uint32_t coreml_flags);
Expand All @@ -25,6 +28,9 @@ class CoreMLOptions {
bool CreateMLProgram() const { return create_mlprogram_; }
bool EnableOnSubgraph() const { return enable_on_subgraph_; }
uint32_t ComputeUnits(uint32_t specific_flag = 0xffffffff) const { return compute_units_ & specific_flag; }
bool AllowLowPrecisionAccumulationOnGPU() const { return allow_low_precision_accumulation_on_gpu_; }
bool UseStrategy(std::string_view strategy) const { return strategy_ == strategy; }
bool ProfileComputePlan() const { return profile_compute_plan_ && create_mlprogram_; }

private:
void ValidateAndParseProviderOption(const ProviderOptions& options);
Expand Down
3 changes: 2 additions & 1 deletion onnxruntime/core/providers/coreml/model/model.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#endif

namespace onnxruntime {
class CoreMLOptions;
namespace coreml {

class Execution;
Expand Down Expand Up @@ -53,7 +54,7 @@ class Model {
std::unordered_map<std::string, OnnxTensorInfo>&& input_output_info,
std::unordered_set<std::string>&& scalar_outputs,
std::unordered_set<std::string>&& int64_outputs,
const logging::Logger& logger, uint32_t coreml_compute_unit);
const logging::Logger& logger, const CoreMLOptions& coreml_options);

~Model();
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Model);
Expand Down
92 changes: 82 additions & 10 deletions onnxruntime/core/providers/coreml/model/model.mm
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "core/providers/coreml/model/host_utils.h"
#include "core/providers/coreml/model/objc_str_utils.h"
#include "core/providers/coreml/shape_utils.h"
#include "core/providers/coreml/coreml_options.h"

// force the linker to create a dependency on the CoreML framework so that in MAUI usage we don't need
// to manually do this
Expand Down Expand Up @@ -300,14 +301,61 @@ Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array,
return Status::OK();
}

// MLComputePlan is only declared in <CoreML/CoreML.h> when building with __clang_major__ >= 15.
// Checking __clang_major__ effectively ensures that the macOS/iOS SDK and Xcode version support
// `macOS 14.4, iOS 17.4` or later. The API_AVAILABLE macro should also work.
// Without this check, the compiler complains that `MLComputePlan` is not defined.
// The !defined(__clang_analyzer__) check is there to bypass static analysis.
// Logs, for each non-'const' operation in the "main" function of an ML Program model, the
// compute device Core ML prefers for it and its estimated cost.
//
// compileUrl: URL of the compiled model whose compute plan is loaded.
// config:     model configuration passed to MLComputePlan when loading the plan.
//
// All output goes through NSLog from the completion handler passed to MLComputePlan.
// NOTE(review): the handler presumably runs asynchronously, i.e. possibly after this function
// has returned - confirm against the MLComputePlan documentation.
//
// The function only logs; it never reports failure to the caller. If the compute plan API is
// unavailable (old OS at runtime, or a toolchain without MLComputePlan at build time), a message
// saying so is logged instead.
void ProfileComputePlan(NSURL* compileUrl, MLModelConfiguration* config) {
#if defined(__APPLE__) && defined(__clang__) && __clang_major__ >= 15 && !defined(__clang_analyzer__)
  if (@available(macOS 14.4, iOS 17.4, *)) {
    [MLComputePlan loadContentsOfURL:compileUrl
                       configuration:config
                   completionHandler:^(MLComputePlan* _Nullable computePlan, NSError* _Nullable error) {
                     if (!computePlan) {
                       NSLog(@"Error loading compute plan: %@", error);
                       return;
                     }

                     // `program` is nil when the model is not an ML Program.
                     MLModelStructureProgram* program = computePlan.modelStructure.program;
                     if (!program) {
                       NSLog(@"Error loading program from compute plan., this is not a mlprogram model");
                       return;
                     }

                     MLModelStructureProgramFunction* mainFunction = program.functions[@"main"];
                     if (!mainFunction) {
                       NSLog(@"Error loading main function from program");
                       return;
                     }

                     NSArray<MLModelStructureProgramOperation*>* operations = mainFunction.block.operations;
                     // NSUInteger has a platform-dependent width, so cast explicitly to match %lu.
                     NSLog(@"Number of operations, 'const' node is included. : %lu",
                           (unsigned long)operations.count);

                     for (MLModelStructureProgramOperation* operation in operations) {
                       // 'const' operations are counted above but not reported, so skip them
                       // before asking the plan for device usage and cost.
                       if ([operation.operatorName isEqualToString:@"const"]) {
                         continue;
                       }

                       // Get the compute device usage for the operation.
                       MLComputePlanDeviceUsage* computeDeviceUsage =
                           [computePlan computeDeviceUsageForMLProgramOperation:operation];
                       id<MLComputeDeviceProtocol> preferredDevice = computeDeviceUsage.preferredComputeDevice;
                       // Get the estimated cost of executing the operation.
                       MLComputePlanCost* estimatedCost = [computePlan estimatedCostOfMLProgramOperation:operation];
                       NSLog(@"Operation: %@, Device Usage: %@, Estimated Cost: %f",
                             operation.operatorName, preferredDevice, estimatedCost.weight);
                     }
                   }];
  } else {
    NSLog(@"iOS 17.4+/macOS 14.4+ or later is required to use the compute plan API");
  }
#else
  // MLComputePlan is not available with this toolchain. Say so instead of silently ignoring
  // the profiling request, and keep the parameters from triggering unused-parameter warnings.
  (void)compileUrl;
  (void)config;
  NSLog(@"The compute plan API is not available in this build; skipping compute plan profiling");
#endif
}

// Internal Execution class
// This class is part of the model class and handles the calls into CoreML. Specifically, it performs
// 1. Compile the model by given path for execution
// 2. Predict using given OnnxTensorFeatureProvider input and copy the output data back ORT
// 3. The compiled model will be removed in dealloc or removed using cleanup function
class Execution {
public:
Execution(const std::string& path, const logging::Logger& logger, uint32_t coreml_flags);
Execution(const std::string& path, const logging::Logger& logger, const CoreMLOptions& coreml_options);
~Execution();

Status LoadModel();
Expand All @@ -320,13 +368,13 @@ Status Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
NSString* coreml_model_path_{nil};
NSString* compiled_model_path_{nil};
const logging::Logger& logger_;
uint32_t coreml_compute_unit_{0};
CoreMLOptions coreml_options_;
MLModel* model_{nil};
};

Execution::Execution(const std::string& path, const logging::Logger& logger, uint32_t coreml_compute_unit)
Execution::Execution(const std::string& path, const logging::Logger& logger, const CoreMLOptions& coreml_options)
: logger_(logger),
coreml_compute_unit_(coreml_compute_unit) {
coreml_options_(coreml_options) {
@autoreleasepool {
coreml_model_path_ = util::Utf8StringToNSString(path.c_str());
}
Expand Down Expand Up @@ -395,17 +443,41 @@ Status Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
compiled_model_path_ = [compileUrl path];

MLModelConfiguration* config = [[MLModelConfiguration alloc] init];

if (coreml_compute_unit_ & COREML_FLAG_USE_CPU_ONLY) {
uint32_t coreml_compute_unit = coreml_options_.ComputeUnits();
if (coreml_compute_unit & COREML_FLAG_USE_CPU_ONLY) {
config.computeUnits = MLComputeUnitsCPUOnly;
} else if (coreml_compute_unit_ & COREML_FLAG_USE_CPU_AND_GPU) {
} else if (coreml_compute_unit & COREML_FLAG_USE_CPU_AND_GPU) {
config.computeUnits = MLComputeUnitsCPUAndGPU;
} else if (coreml_compute_unit_ & COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE) {
} else if (coreml_compute_unit & COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE) {
config.computeUnits = MLComputeUnitsCPUAndNeuralEngine; // Apple Neural Engine
} else {
config.computeUnits = MLComputeUnitsAll;
}

if (coreml_options_.AllowLowPrecisionAccumulationOnGPU()) {
config.allowLowPrecisionAccumulationOnGPU = YES;
}

// Apply the requested specialization strategy (Default or FastPrediction) when Core ML 8 or later is available.
// optimizationHints is only declared in <CoreML/CoreML.h> when building with __clang_major__ >= 15;
// see the comments on ProfileComputePlan for why __clang_major__ is checked.
// The !defined(__clang_analyzer__) check is there to bypass static analysis.
#if defined(__APPLE__) && defined(__clang__) && __clang_major__ >= 15 && !defined(__clang_analyzer__)
if (HAS_COREML8_OR_LATER) {
MLOptimizationHints* optimizationHints = [[MLOptimizationHints alloc] init];
if (coreml_options_.UseStrategy("FastPrediction")) {
optimizationHints.specializationStrategy = MLSpecializationStrategyFastPrediction;
config.optimizationHints = optimizationHints;
} else if (coreml_options_.UseStrategy("Default")) {
optimizationHints.specializationStrategy = MLSpecializationStrategyDefault;
config.optimizationHints = optimizationHints;
}
}
#endif
if (coreml_options_.ProfileComputePlan()) {
ProfileComputePlan(compileUrl, config);
}

model_ = [MLModel modelWithContentsOfURL:compileUrl configuration:config error:&error];

if (error != nil || model_ == nil) {
Expand Down Expand Up @@ -524,8 +596,8 @@ Status Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
std::unordered_set<std::string>&& scalar_outputs,
std::unordered_set<std::string>&& int64_outputs,
const logging::Logger& logger,
uint32_t coreml_flags)
: execution_(std::make_unique<Execution>(path, logger, coreml_flags)),
const CoreMLOptions& coreml_options)
: execution_(std::make_unique<Execution>(path, logger, coreml_options)),
model_input_names_(std::move(model_input_names)),
model_output_names_(std::move(model_output_names)),
input_output_info_(std::move(input_output_info)),
Expand Down
Loading

0 comments on commit b834143

Please sign in to comment.