avoid redundant memory allocation for external initializers (#21682)

### Description

Avoid redundant memory allocation for external initializers. We will use mmap for external initializers later, so there is no point in allocating memory in advance only to release it afterwards.

### Motivation and Context

In the current implementation, we:

1. Allocate memory (of the size required by the current initializer) for the initializer first: [https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/session_state_utils.cc#L131](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/session_state_utils.cc#L131)
2. For an external initializer, point the initializer to the mmapped object in memory and release the previously allocated tensor: [https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/session_state_utils.cc#L89](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/session_state_utils.cc#L89)

For large models, we keep allocating and releasing memory for external initializers, which seems unnecessary. For the phi silica model, this change reduces transient memory usage from 4,566MB to 2,724MB.
Since this redundant memory is released quickly once we mmap the external initializers, this change does not have much impact on peak memory usage.
microsoft · Aug 14, 2024 · a0708a0 · a0708a0
1 parent 7172aff
commit a0708a0
Show file tree

Hide file tree

Showing 2 changed files with 139 additions and 57 deletions.
diff --git a/onnxruntime/core/framework/session_state_utils.cc b/onnxruntime/core/framework/session_state_utils.cc
@@ -113,28 +113,14 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st
   TensorShape tensor_shape = utils::GetTensorShapeFromTensorProto(tensor_proto);
   const DataTypeImpl* const type = DataTypeImpl::TensorTypeFromONNXEnum(tensor_proto.data_type())->GetElementType();
   std::unique_ptr<Tensor> p_tensor;
-  if (m != nullptr) {
-    p_tensor = std::make_unique<Tensor>(type, tensor_shape, m->GetBuffer(), m->GetAllocInfo());
-    if (m->GetLen() < p_tensor->SizeInBytes()) {
-      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Internal error. The preallocated buffer is too small. Requires ",
-                             p_tensor->SizeInBytes(), ", Got ", m->GetLen());
-    }
-  } else {
-    if (use_device_allocator_for_initializers) {
-      void* tensor_buffer = nullptr;
-      ORT_RETURN_IF_ERROR(AllocateBufferUsingDeviceAllocatorFromShapeAndType(tensor_shape, type, alloc, tensor_buffer));
-      p_tensor = std::make_unique<Tensor>(type, tensor_shape, tensor_buffer, alloc);
-    } else {
-      // If the provided allocator is an arena-based allocator, the call to Alloc() will tap into memory from the arena
-      // (may expand it if there isn't a chunk that can be allotted to the memory request).
-      // If the provided allocator is non-arena based, the device specific Alloc() call will be used to allocate the necessary memory.
-      p_tensor = std::make_unique<Tensor>(type, tensor_shape, alloc);
-    }
-  }
 
-  if (p_tensor->Location().device.Type() == OrtDevice::CPU) {
-    // deserialize directly to CPU tensor
-    if (utils::HasExternalData(tensor_proto)) {
+  auto device_type = (alloc != nullptr) ? alloc->Info().device.Type() : m->GetAllocInfo().device.Type();
+
+  if (utils::HasExternalData(tensor_proto)) {
+    if (device_type == OrtDevice::CPU) {
+      // for external initializer on CPU we will use mmap for large initializers so don't need to allocate memory in advance
+      p_tensor = std::make_unique<Tensor>(type, TensorShape(), alloc);
+
       // NB: The file containing external data for the tensor is mmap'd. If the tensor will be used on CPU we can
       // utilize the mmap'd buffer directly by calling ExtDataTensorProtoToTensor. If we called
       // TensorProtoToTensor it would copy the data, causing unnecessary overhead
@@ -143,57 +129,132 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st
                                                      ext_data_deleter, buffered_tensor));
 
       ExtDataValueDeleter deleter{ext_data_deleter, p_tensor.get()};
-
       MLDataType ml_tensor_type = DataTypeImpl::GetType<Tensor>();
       ort_value.Init(p_tensor.release(), ml_tensor_type, deleter);
       return common::Status::OK();
-    }
-    ORT_RETURN_IF_ERROR(utils::TensorProtoToTensor(env, proto_path.c_str(), tensor_proto, *p_tensor));
-  } else {  // non-cpu tensor
-    if (tensor_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING) {
-      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "string tensor is not supported for copying between allocators");
-    }
+    } else {  // non-cpu tensor
+      if (tensor_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "string tensor is not supported for copying between allocators");
+      }
 
-    // deserialize to CPU first for non-CPU allocator, then copy
-    std::unique_ptr<Tensor> p_deserialize_tensor;
-    if (use_device_allocator_for_initializers) {
-      void* tensor_buffer = nullptr;
-      ORT_RETURN_IF_ERROR(AllocateBufferUsingDeviceAllocatorFromShapeAndType(tensor_shape, type, default_cpu_alloc, tensor_buffer));
-      p_deserialize_tensor = std::make_unique<Tensor>(type, tensor_shape, tensor_buffer, default_cpu_alloc);
-    } else {
-      // If the provided allocator is an arena-based allocator, the call to Alloc() will tap into memory from the arena
-      // (may expand it if there isn't a chunk that can be allotted to the memory request).
-      // If the provided allocator is non-arena based, the device specific Alloc() call will be used to allocate the necessary memory.
-      p_deserialize_tensor = std::make_unique<Tensor>(type, tensor_shape, default_cpu_alloc);
-    }
+      // deserialize to CPU first for non-CPU allocator, then copy to device
+      // for external initializer load on non-CPU device:
+      // 1. allocate memory on device - p_tensor
+      // 2. load initializer into CPU memory - p_deserialize_tensor,
+      //    we will use mmap so no need to allocate memory on CPU in advance
+      // 3. copy tensor from CPU to device - p_deserialize_tensor -> p_tensor
+      auto allocate_on_device_status = AllocateTensor(m, p_tensor, type, tensor_shape, use_device_allocator_for_initializers, alloc);
+      if (!allocate_on_device_status.IsOK()) {
+        return allocate_on_device_status;
+      }
+
+      std::unique_ptr<Tensor> p_deserialize_tensor = std::make_unique<Tensor>(type, TensorShape(), default_cpu_alloc);
 
-    OrtCallback ext_data_deleter;
-    std::optional<ScopedOrtCallbackInvoker> scoped_ort_callback_invoker;
-    if (utils::HasExternalData(tensor_proto)) {
+      OrtCallback ext_data_deleter;
+      std::optional<ScopedOrtCallbackInvoker> scoped_ort_callback_invoker;
       ORT_RETURN_IF_ERROR(ExtDataTensorProtoToTensor(env, proto_path, tensor_proto, *p_deserialize_tensor,
                                                      ext_data_deleter, buffered_tensor));
       scoped_ort_callback_invoker = ScopedOrtCallbackInvoker(ext_data_deleter);
-    } else {
-      ORT_RETURN_IF_ERROR(utils::TensorProtoToTensor(env, proto_path.c_str(), tensor_proto, *p_deserialize_tensor));
+      // TODO!! Need a temp buffer allocator for non-escape buffers that maybe too big for stack allocation.
+
+      return CopyTensorFromCPUToDevice(data_transfer_mgr, p_deserialize_tensor, p_tensor, ort_value);
     }
-    // TODO!! Need a temp buffer allocator for non-escape buffers that maybe too big for stack allocation.
-
-    Status copy_status = data_transfer_mgr.CopyTensor(*p_deserialize_tensor, *p_tensor);
-    if (!copy_status.IsOK()) {
-      if (copy_status.ErrorMessage().empty()) {
-        // The windows execution provider does not return any error message today for CopyTensor since it is
-        // not implemented yet. That's the reason we're adding our own error message so that we can debug better.
-        return Status(copy_status.Category(), copy_status.Code(),
-                      "Failed to copy tensor to " + p_tensor->Location().ToString());
+  } else {
+    // for internal initializer, always allocate memory on device - p_tensor
+    auto allocate_on_device_status = AllocateTensor(m, p_tensor, type, tensor_shape, use_device_allocator_for_initializers, alloc);
+    if (!allocate_on_device_status.IsOK()) {
+      return allocate_on_device_status;
+    }
+
+    if (device_type == OrtDevice::CPU) {
+      // deserialize directly to CPU tensor
+      ORT_RETURN_IF_ERROR(utils::TensorProtoToTensor(env, proto_path.c_str(), tensor_proto, *p_tensor));
+      auto ml_tensor = DataTypeImpl::GetType<Tensor>();
+      ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc());
+      return common::Status::OK();
+    } else {  // non-cpu tensor
+      if (tensor_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "string tensor is not supported for copying between allocators");
+      }
+
+      // deserialize to CPU first for non-CPU allocator, then copy
+      // for internal initializer
+      // 1. allocate memory on CPU - p_deserialize_tensor
+      // 2. deserialize tensor_proto into a preallocated tensor (p_deserialize_tensor)
+      // 3. copy tensor from CPU to device - p_deserialize_tensor -> p_tensor
+      std::unique_ptr<Tensor> p_deserialize_tensor;
+      auto allocate_on_cpu_status = AllocateTensorOnDeviceOrMemory(use_device_allocator_for_initializers, tensor_shape, type, default_cpu_alloc, p_deserialize_tensor);
+      if (!allocate_on_cpu_status.IsOK()) {
+        return allocate_on_cpu_status;
       }
-      return copy_status;
+
+      ORT_RETURN_IF_ERROR(utils::TensorProtoToTensor(env, proto_path.c_str(), tensor_proto, *p_deserialize_tensor));
+      // TODO!! Need a temp buffer allocator for non-escape buffers that maybe too big for stack allocation.
+
+      return CopyTensorFromCPUToDevice(data_transfer_mgr, p_deserialize_tensor, p_tensor, ort_value);
+    }
+  }
+}
+
+common::Status AllocateTensor(
+    const onnxruntime::MemBuffer* m,
+    std::unique_ptr<onnxruntime::Tensor>& p_tensor,
+    const onnxruntime::DataTypeImpl* const& type,
+    onnxruntime::TensorShape& tensor_shape,
+    bool use_device_allocator_for_initializers,
+    const onnxruntime::AllocatorPtr& alloc) {
+  if (m != nullptr) {
+    p_tensor = std::make_unique<Tensor>(type, tensor_shape, m->GetBuffer(), m->GetAllocInfo());
+    if (m->GetLen() < p_tensor->SizeInBytes()) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Internal error. The preallocated buffer is too small. Requires ",
+                             p_tensor->SizeInBytes(), ", Got ", m->GetLen());
     }
+  } else {
+    return AllocateTensorOnDeviceOrMemory(use_device_allocator_for_initializers, tensor_shape, type, alloc, p_tensor);
   }
-  auto ml_tensor = DataTypeImpl::GetType<Tensor>();
-  ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc());
   return common::Status::OK();
 }
 
+common::Status AllocateTensorOnDeviceOrMemory(
+    bool use_device_allocator_for_initializers,
+    onnxruntime::TensorShape& tensor_shape,
+    const onnxruntime::DataTypeImpl* const& type,
+    const onnxruntime::AllocatorPtr& alloc,
+    std::unique_ptr<onnxruntime::Tensor>& p_tensor) {
+  if (use_device_allocator_for_initializers) {
+    void* tensor_buffer = nullptr;
+    ORT_RETURN_IF_ERROR(AllocateBufferUsingDeviceAllocatorFromShapeAndType(tensor_shape, type, alloc, tensor_buffer));
+    p_tensor = std::make_unique<Tensor>(type, tensor_shape, tensor_buffer, alloc);
+  } else {
+    // If the provided allocator is an arena-based allocator, the call to Alloc() will tap into memory from the arena
+    // (may expand it if there isn't a chunk that can be allotted to the memory request).
+    // If the provided allocator is non-arena based, the device specific Alloc() call will be used to allocate the necessary memory.
+    p_tensor = std::make_unique<Tensor>(type, tensor_shape, alloc);
+  }
+  return common::Status::OK();
+}
+
+common::Status CopyTensorFromCPUToDevice(
+    const onnxruntime::DataTransferManager& data_transfer_mgr,
+    std::unique_ptr<onnxruntime::Tensor>& p_deserialize_tensor,
+    std::unique_ptr<onnxruntime::Tensor>& p_tensor,
+    OrtValue& ort_value) {
+  Status copy_status = data_transfer_mgr.CopyTensor(*p_deserialize_tensor, *p_tensor);
+  if (!copy_status.IsOK()) {
+    if (copy_status.ErrorMessage().empty()) {
+      // The windows execution provider does not return any error message today for CopyTensor since it is
+      // not implemented yet. That's the reason we're adding our own error message so that we can debug better.
+      return Status(copy_status.Category(), copy_status.Code(),
+                    "Failed to copy tensor to " + p_tensor->Location().ToString());
+    }
+    return copy_status;
+  } else {
+    auto ml_tensor = DataTypeImpl::GetType<Tensor>();
+    ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc());
+    return common::Status::OK();
+  }
+}
+
 common::Status SaveInitializedTensors(
     const Env& env, const std::basic_string<PATH_CHAR_TYPE>& graph_loc,
     const GraphViewer& graph, const AllocatorPtr& default_cpu_alloc,

diff --git a/onnxruntime/core/framework/session_state_utils.h b/onnxruntime/core/framework/session_state_utils.h
@@ -50,6 +50,27 @@ common::Status SaveInitializedTensors(
     const MemoryProfileFunction& memory_profile_func,
     std::unordered_map<std::string, std::unique_ptr<Tensor>>& buffered_tensors);
 
+common::Status AllocateTensor(
+    const onnxruntime::MemBuffer* m,
+    std::unique_ptr<onnxruntime::Tensor>& p_tensor,
+    const onnxruntime::DataTypeImpl* const& type,
+    onnxruntime::TensorShape& tensor_shape,
+    bool use_device_allocator_for_initializers,
+    const onnxruntime::AllocatorPtr& alloc);
+
+common::Status AllocateTensorOnDeviceOrMemory(
+    bool use_device_allocator_for_initializers,
+    onnxruntime::TensorShape& tensor_shape,
+    const onnxruntime::DataTypeImpl* const& type,
+    const onnxruntime::AllocatorPtr& alloc,
+    std::unique_ptr<onnxruntime::Tensor>& p_tensor);
+
+common::Status CopyTensorFromCPUToDevice(
+    const onnxruntime::DataTransferManager& data_transfer_mgr,
+    std::unique_ptr<onnxruntime::Tensor>& p_deserialize_tensor,
+    std::unique_ptr<onnxruntime::Tensor>& p_tensor,
+    OrtValue& ort_value);
+
 common::Status SaveInputOutputNamesToNodeMapping(const GraphViewer& graph,
                                                  SessionState& session_state,
                                                  gsl::span<const NodeArg* const> implicit_inputs);