avoid redundant memory allocation for external initializers (#21682)
### Description
Avoid redundant memory allocation for external initializers. External initializers are backed by mmap later in the load path, so there is no point in allocating memory for them in advance only to release it again.



### Motivation and Context
In the current implementation, we:
1. Allocate memory (sized to the current initializer) for the initializer first:
https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/session_state_utils.cc#L131
2. For an external initializer, point the initializer at the mmap'd object in memory and release the previously allocated tensor (see the sketch after this list):
https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/session_state_utils.cc#L89
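
As a minimal, self-contained illustration of that two-step pattern (POSIX-only; the helper name is hypothetical and this is not the actual ONNX Runtime code), the full-size buffer from step 1 ends up released unused once the mmap'd region takes over in step 2:

```cpp
// Illustrative only: the old allocate-then-discard flow for an external
// initializer destined for CPU.
#include <cstddef>
#include <cstdlib>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

void* LoadExternalInitializerOld(const char* path, std::size_t num_bytes) {
  // Step 1: memory is allocated up front, sized to the initializer.
  void* preallocated = std::malloc(num_bytes);

  // Step 2: the external file is mmap'd and becomes the tensor's backing store.
  int fd = open(path, O_RDONLY);
  if (fd < 0) {
    std::free(preallocated);
    return nullptr;
  }
  void* mapped = mmap(nullptr, num_bytes, PROT_READ, MAP_PRIVATE, fd, 0);
  close(fd);  // the mapping stays valid after closing the descriptor

  // The buffer from step 1 is released without ever having been used.
  std::free(preallocated);
  return mapped == MAP_FAILED ? nullptr : mapped;
}
```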

For large models, we keep allocating and releasing memory for external initializers, which is unnecessary.

For the Phi Silica model, this change reduces transient memory usage from 4,566 MB to 2,724 MB. Because the redundant memory was released quickly once the external initializers were mmap'd, the change has little impact on peak memory usage.
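
For a conceptual picture of the new behavior, here is a self-contained C++ sketch of the decision flow after this change. POSIX mmap and plain malloc stand in for ORT's tensor/allocator machinery; `Device`, `Buffer`, `MapExternalFile`, and `LoadInitializer` are illustrative names, not ORT APIs. External data destined for CPU is backed directly by the mapping with no up-front allocation, while internal data and non-CPU targets still allocate as before.

```cpp
// Illustrative only: decision flow of the new path, with mmap/malloc as
// stand-ins for ORT's tensor and allocator machinery.
#include <cstddef>
#include <cstdlib>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

enum class Device { CPU, GPU };

struct Buffer {
  void* data = nullptr;
  std::size_t size = 0;
  bool mapped = false;  // true when backed by mmap rather than malloc
};

// Hypothetical helper: map an external-weights file read-only.
static Buffer MapExternalFile(const char* path, std::size_t num_bytes) {
  Buffer buf;
  int fd = open(path, O_RDONLY);
  if (fd < 0) return buf;
  void* p = mmap(nullptr, num_bytes, PROT_READ, MAP_PRIVATE, fd, 0);
  close(fd);  // the mapping stays valid after closing the descriptor
  if (p != MAP_FAILED) {
    buf.data = p;
    buf.size = num_bytes;
    buf.mapped = true;
  }
  return buf;
}

Buffer LoadInitializer(bool is_external, Device target,
                       const char* external_path, std::size_t num_bytes) {
  if (is_external && target == Device::CPU) {
    // External data destined for CPU: back the tensor directly with the
    // mapping; the up-front allocation removed by this PR is skipped.
    return MapExternalFile(external_path, num_bytes);
  }
  // Internal data, or data that must end up on another device: a real buffer
  // is still allocated (malloc here stands in for the device allocator), to be
  // filled by deserialization and/or a CPU-to-device copy.
  Buffer buf;
  buf.data = std::malloc(num_bytes);
  buf.size = num_bytes;
  return buf;
}
```

The transient-memory saving comes entirely from skipping the pre-allocation in the first branch; the device-bound and internal-initializer paths are unchanged.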
frank-dong-ms authored Aug 14, 2024
1 parent 7172aff commit a0708a0
Showing 2 changed files with 139 additions and 57 deletions.
175 changes: 118 additions & 57 deletions onnxruntime/core/framework/session_state_utils.cc
@@ -113,28 +113,14 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st
TensorShape tensor_shape = utils::GetTensorShapeFromTensorProto(tensor_proto);
const DataTypeImpl* const type = DataTypeImpl::TensorTypeFromONNXEnum(tensor_proto.data_type())->GetElementType();
std::unique_ptr<Tensor> p_tensor;
if (m != nullptr) {
p_tensor = std::make_unique<Tensor>(type, tensor_shape, m->GetBuffer(), m->GetAllocInfo());
if (m->GetLen() < p_tensor->SizeInBytes()) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Internal error. The preallocated buffer is too small. Requires ",
p_tensor->SizeInBytes(), ", Got ", m->GetLen());
}
} else {
if (use_device_allocator_for_initializers) {
void* tensor_buffer = nullptr;
ORT_RETURN_IF_ERROR(AllocateBufferUsingDeviceAllocatorFromShapeAndType(tensor_shape, type, alloc, tensor_buffer));
p_tensor = std::make_unique<Tensor>(type, tensor_shape, tensor_buffer, alloc);
} else {
// If the provided allocator is an arena-based allocator, the call to Alloc() will tap into memory from the arena
// (may expand it if there isn't a chunk that can be allotted to the memory request).
// If the provided allocator is non-arena based, the device specific Alloc() call will be used to allocate the necessary memory.
p_tensor = std::make_unique<Tensor>(type, tensor_shape, alloc);
}
}

if (p_tensor->Location().device.Type() == OrtDevice::CPU) {
// deserialize directly to CPU tensor
if (utils::HasExternalData(tensor_proto)) {
auto device_type = (alloc != nullptr) ? alloc->Info().device.Type() : m->GetAllocInfo().device.Type();

if (utils::HasExternalData(tensor_proto)) {
if (device_type == OrtDevice::CPU) {
// for external initializer on CPU we will use mmap for large initializers so don't need to allocate memory in advance
p_tensor = std::make_unique<Tensor>(type, TensorShape(), alloc);

// NB: The file containing external data for the tensor is mmap'd. If the tensor will be used on CPU we can
// utilize the mmap'd buffer directly by calling ExtDataTensorProtoToTensor. If we called
// TensorProtoToTensor it would copy the data, causing unnecessary overhead
@@ -143,57 +129,132 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st
ext_data_deleter, buffered_tensor));

ExtDataValueDeleter deleter{ext_data_deleter, p_tensor.get()};

MLDataType ml_tensor_type = DataTypeImpl::GetType<Tensor>();
ort_value.Init(p_tensor.release(), ml_tensor_type, deleter);
return common::Status::OK();
}
ORT_RETURN_IF_ERROR(utils::TensorProtoToTensor(env, proto_path.c_str(), tensor_proto, *p_tensor));
} else { // non-cpu tensor
if (tensor_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "string tensor is not supported for copying between allocators");
}
} else { // non-cpu tensor
if (tensor_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "string tensor is not supported for copying between allocators");
}

// deserialize to CPU first for non-CPU allocator, then copy
std::unique_ptr<Tensor> p_deserialize_tensor;
if (use_device_allocator_for_initializers) {
void* tensor_buffer = nullptr;
ORT_RETURN_IF_ERROR(AllocateBufferUsingDeviceAllocatorFromShapeAndType(tensor_shape, type, default_cpu_alloc, tensor_buffer));
p_deserialize_tensor = std::make_unique<Tensor>(type, tensor_shape, tensor_buffer, default_cpu_alloc);
} else {
// If the provided allocator is an arena-based allocator, the call to Alloc() will tap into memory from the arena
// (may expand it if there isn't a chunk that can be allotted to the memory request).
// If the provided allocator is non-arena based, the device specific Alloc() call will be used to allocate the necessary memory.
p_deserialize_tensor = std::make_unique<Tensor>(type, tensor_shape, default_cpu_alloc);
}
// deserialize to CPU first for non-CPU allocator, then copy to device
// for external initializer load on non-CPU device:
// 1. allocate memory on device - p_tensor
// 2. load initializer into CPU memory - p_deserialize_tensor,
// we will use mmap so no need to allocate memory on CPU in advance
// 3. copy tensor from CPU to device - p_deserialize_tensor -> p_tensor
auto allocate_on_device_status = AllocateTensor(m, p_tensor, type, tensor_shape, use_device_allocator_for_initializers, alloc);
if (!allocate_on_device_status.IsOK()) {
return allocate_on_device_status;
}

std::unique_ptr<Tensor> p_deserialize_tensor = std::make_unique<Tensor>(type, TensorShape(), default_cpu_alloc);

OrtCallback ext_data_deleter;
std::optional<ScopedOrtCallbackInvoker> scoped_ort_callback_invoker;
if (utils::HasExternalData(tensor_proto)) {
OrtCallback ext_data_deleter;
std::optional<ScopedOrtCallbackInvoker> scoped_ort_callback_invoker;
ORT_RETURN_IF_ERROR(ExtDataTensorProtoToTensor(env, proto_path, tensor_proto, *p_deserialize_tensor,
ext_data_deleter, buffered_tensor));
scoped_ort_callback_invoker = ScopedOrtCallbackInvoker(ext_data_deleter);
} else {
ORT_RETURN_IF_ERROR(utils::TensorProtoToTensor(env, proto_path.c_str(), tensor_proto, *p_deserialize_tensor));
// TODO!! Need a temp buffer allocator for non-escape buffers that maybe too big for stack allocation.

return CopyTensorFromCPUToDevice(data_transfer_mgr, p_deserialize_tensor, p_tensor, ort_value);
}
// TODO!! Need a temp buffer allocator for non-escape buffers that maybe too big for stack allocation.

Status copy_status = data_transfer_mgr.CopyTensor(*p_deserialize_tensor, *p_tensor);
if (!copy_status.IsOK()) {
if (copy_status.ErrorMessage().empty()) {
// The windows execution provider does not return any error message today for CopyTensor since it is
// not implemented yet. That's the reason we're adding our own error message so that we can debug better.
return Status(copy_status.Category(), copy_status.Code(),
"Failed to copy tensor to " + p_tensor->Location().ToString());
} else {
// for internal initializer, always allocate memory on device - p_tensor
auto allocate_on_device_status = AllocateTensor(m, p_tensor, type, tensor_shape, use_device_allocator_for_initializers, alloc);
if (!allocate_on_device_status.IsOK()) {
return allocate_on_device_status;
}

if (device_type == OrtDevice::CPU) {
// deserialize directly to CPU tensor
ORT_RETURN_IF_ERROR(utils::TensorProtoToTensor(env, proto_path.c_str(), tensor_proto, *p_tensor));
auto ml_tensor = DataTypeImpl::GetType<Tensor>();
ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc());
return common::Status::OK();
} else { // non-cpu tensor
if (tensor_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "string tensor is not supported for copying between allocators");
}

// deserialize to CPU first for non-CPU allocator, then copy
// for internal initializer
// 1. allocate memory on CPU - p_deserialize_tensor
// 2. deserialize tensor_probo into a preallocated tensor (p_deserialize_tensor)
// 3. copy tensor from CPU to device - p_deserialize_tensor -> p_tensor
std::unique_ptr<Tensor> p_deserialize_tensor;
auto allocate_on_cpu_status = AllocateTensorOnDeviceOrMemory(use_device_allocator_for_initializers, tensor_shape, type, default_cpu_alloc, p_deserialize_tensor);
if (!allocate_on_cpu_status.IsOK()) {
return allocate_on_cpu_status;
}
return copy_status;

ORT_RETURN_IF_ERROR(utils::TensorProtoToTensor(env, proto_path.c_str(), tensor_proto, *p_deserialize_tensor));
// TODO!! Need a temp buffer allocator for non-escape buffers that maybe too big for stack allocation.

return CopyTensorFromCPUToDevice(data_transfer_mgr, p_deserialize_tensor, p_tensor, ort_value);
}
}
}

common::Status AllocateTensor(
const onnxruntime::MemBuffer* m,
std::unique_ptr<onnxruntime::Tensor>& p_tensor,
const onnxruntime::DataTypeImpl* const& type,
onnxruntime::TensorShape& tensor_shape,
bool use_device_allocator_for_initializers,
const onnxruntime::AllocatorPtr& alloc) {
if (m != nullptr) {
p_tensor = std::make_unique<Tensor>(type, tensor_shape, m->GetBuffer(), m->GetAllocInfo());
if (m->GetLen() < p_tensor->SizeInBytes()) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Internal error. The preallocated buffer is too small. Requires ",
p_tensor->SizeInBytes(), ", Got ", m->GetLen());
}
} else {
return AllocateTensorOnDeviceOrMemory(use_device_allocator_for_initializers, tensor_shape, type, alloc, p_tensor);
}
auto ml_tensor = DataTypeImpl::GetType<Tensor>();
ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc());
return common::Status::OK();
}

common::Status AllocateTensorOnDeviceOrMemory(
bool use_device_allocator_for_initializers,
onnxruntime::TensorShape& tensor_shape,
const onnxruntime::DataTypeImpl* const& type,
const onnxruntime::AllocatorPtr& alloc,
std::unique_ptr<onnxruntime::Tensor>& p_tensor) {
if (use_device_allocator_for_initializers) {
void* tensor_buffer = nullptr;
ORT_RETURN_IF_ERROR(AllocateBufferUsingDeviceAllocatorFromShapeAndType(tensor_shape, type, alloc, tensor_buffer));
p_tensor = std::make_unique<Tensor>(type, tensor_shape, tensor_buffer, alloc);
} else {
// If the provided allocator is an arena-based allocator, the call to Alloc() will tap into memory from the arena
// (may expand it if there isn't a chunk that can be allotted to the memory request).
// If the provided allocator is non-arena based, the device specific Alloc() call will be used to allocate the necessary memory.
p_tensor = std::make_unique<Tensor>(type, tensor_shape, alloc);
}
return common::Status::OK();
}

common::Status CopyTensorFromCPUToDevice(
const onnxruntime::DataTransferManager& data_transfer_mgr,
std::unique_ptr<onnxruntime::Tensor>& p_deserialize_tensor,
std::unique_ptr<onnxruntime::Tensor>& p_tensor,
OrtValue& ort_value) {
Status copy_status = data_transfer_mgr.CopyTensor(*p_deserialize_tensor, *p_tensor);
if (!copy_status.IsOK()) {
if (copy_status.ErrorMessage().empty()) {
// The windows execution provider does not return any error message today for CopyTensor since it is
// not implemented yet. That's the reason we're adding our own error message so that we can debug better.
return Status(copy_status.Category(), copy_status.Code(),
"Failed to copy tensor to " + p_tensor->Location().ToString());
}
return copy_status;
} else {
auto ml_tensor = DataTypeImpl::GetType<Tensor>();
ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc());
return common::Status::OK();
}
}

common::Status SaveInitializedTensors(
const Env& env, const std::basic_string<PATH_CHAR_TYPE>& graph_loc,
const GraphViewer& graph, const AllocatorPtr& default_cpu_alloc,
21 changes: 21 additions & 0 deletions onnxruntime/core/framework/session_state_utils.h
@@ -50,6 +50,27 @@ common::Status SaveInitializedTensors(
const MemoryProfileFunction& memory_profile_func,
std::unordered_map<std::string, std::unique_ptr<Tensor>>& buffered_tensors);

common::Status AllocateTensor(
const onnxruntime::MemBuffer* m,
std::unique_ptr<onnxruntime::Tensor>& p_tensor,
const onnxruntime::DataTypeImpl* const& type,
onnxruntime::TensorShape& tensor_shape,
bool use_device_allocator_for_initializers,
const onnxruntime::AllocatorPtr& alloc);

common::Status AllocateTensorOnDeviceOrMemory(
bool use_device_allocator_for_initializers,
onnxruntime::TensorShape& tensor_shape,
const onnxruntime::DataTypeImpl* const& type,
const onnxruntime::AllocatorPtr& alloc,
std::unique_ptr<onnxruntime::Tensor>& p_tensor);

common::Status CopyTensorFromCPUToDevice(
const onnxruntime::DataTransferManager& data_transfer_mgr,
std::unique_ptr<onnxruntime::Tensor>& p_deserialize_tensor,
std::unique_ptr<onnxruntime::Tensor>& p_tensor,
OrtValue& ort_value);

common::Status SaveInputOutputNamesToNodeMapping(const GraphViewer& graph,
SessionState& session_state,
gsl::span<const NodeArg* const> implicit_inputs);
