Implement pre-packed blobs serialization on disk

and pre-packed blobs sharing when weights sharing is not enabled. Memory map pre-packed blobs. Recurse into subgraphs in ToGraphProtoWithExternalInitializers to make sure all big weights are serialized along with their pre-packs that are to be shared between the subgraphs.
microsoft · Dec 10, 2024 · 938bccf · 938bccf
1 parent bf4d3e1
commit 938bccf
Show file tree

Hide file tree

Showing 28 changed files with 1,353 additions and 246 deletions.
diff --git a/include/onnxruntime/core/framework/buffer_deleter.h b/include/onnxruntime/core/framework/buffer_deleter.h
@@ -5,6 +5,8 @@
 
 #include "core/framework/allocator.h"
 
+#include <functional>
+
 namespace onnxruntime {
 
 // TODO: Do we need this class or is IAllocator::MakeUniquePtr sufficient/better
@@ -31,6 +33,6 @@ class BufferDeleter {
   AllocatorPtr alloc_{nullptr};
 };
 
-using BufferUniquePtr = std::unique_ptr<void, BufferDeleter>;
+using BufferUniquePtr = std::unique_ptr<void, std::function<void(void*)>>;
 using BufferNakedPtr = void*;
 }  // namespace onnxruntime
diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h
@@ -19,10 +19,10 @@
 #include "core/common/common.h"
 #include "core/common/path_string.h"
 #include "core/common/const_pointer_container.h"
+#include "core/common/inlined_containers_fwd.h"
 #if !defined(ORT_MINIMAL_BUILD)
 #include "core/common/inlined_containers.h"
 #endif
-#include "core/common/inlined_containers_fwd.h"
 #include "core/common/span_utils.h"
 #include "core/common/status.h"
 #include "core/common/logging/logging.h"
@@ -41,6 +41,7 @@ namespace onnxruntime {
 class Graph;
 struct IndexedSubGraph;
 class Model;
+struct ModelSavingOptions;
 class OpSignature;
 
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
@@ -1153,29 +1154,6 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   const ONNX_NAMESPACE::GraphProto& ToGraphProto();
   ONNX_NAMESPACE::GraphProto ToGraphProto() const;
 
-  // Options to align external initializer offset.
-  // For models running on CPU, ORT will try to use mmap to load external initializers.
-  // To use mmap, external initializer need to be offset aligned.
-  // ORT saves external initializers into signle data file, each initializer is accessed with
-  // offset(start position of initializer) and length(byte length of initializer) of the data file.
-  // To use mmap, each offset need to be aligned which means offset need to divisible by
-  // allocation granularity(64KB for windows and 4K for other OSes).
-  // With align_offset to true, ORT will align offset for large initializer when
-  // save ONNX model with external data file.
-  struct OffsetAlignmentInfo {
-    // Offset will always be page aligned and allocation granularity aligned for mmap support.
-    // This is done by padding previous tensor data with zeros keeping same length.
-    bool align_offset = false;
-    // Alignment threshold for size of data.
-    // Having a low threshold will waste file space for small initializers.
-    // Only when tensor's data size is > the page_align_threshold it will be force aligned.
-    // Default to 1MB.
-    int64_t align_threshold = 1048576;
-    // The allocation Granularity for mmap() support.
-    // Typically 64KB for Windows & 4KB for other OSes. Default to 64KB.
-    int64_t allocation_granularity = 65536;
-  };
-
   /** Gets the GraphProto representation of this Graph
   @param external_file_path File path of the binary file to use for initializers.
   @param model_file_path path of the model file.
@@ -1186,15 +1164,7 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   */
   ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
                                                                   const std::filesystem::path& model_file_path,
-                                                                  size_t initializer_size_threshold,
-                                                                  const OffsetAlignmentInfo& align_info) const;
-
-  ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
-                                                                  const std::filesystem::path& model_file_path,
-                                                                  size_t initializer_size_threshold) const {
-    OffsetAlignmentInfo default_options;
-    return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options);
-  }
+                                                                  const ModelSavingOptions& model_saving_options) const;
 
   /** Gets the ISchemaRegistry instances being used with this Graph. */
   IOnnxRuntimeOpSchemaCollectionPtr GetSchemaRegistry() const;
@@ -1519,6 +1489,28 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   Status AddConstantProtoAsInitializer(const ONNX_NAMESPACE::NodeProto& constant_node_proto,
                                        std::optional<std::string_view> new_name);
 
+  /// <summary>
+  /// A map that is used to keep track of pre-packed blobs to be serialized
+  /// The implementation adds pre-packed external data references to the TensorProto
+  /// that contains the initializer data. However, it may be an outerscope initializer.
+  /// Thus we need to keep track of the pre-packed blobs that are not serialized in this
+  /// graph, so the parent can make sure it is being serialized.
+  ///
+  /// The map below has <weight_name, set of blob_key_names>. It contains the entries
+  /// that are not serialized in this graph, so the parent must make sure they are serialized.
+  /// </summary>
+  using WeightToPrePacksMap = NodeHashMap<std::string, InlinedHashSet<std::string>>;
+
+  Status ToGraphProtoWithExternalInitiallizersImpl(
+      const std::filesystem::path& model_path,
+      const std::filesystem::path& external_file_path,
+      const std::filesystem::path& modified_external_file_path,
+      const ModelSavingOptions& model_saving_options,
+      WeightToPrePacksMap& unprocessed_prepacks,
+      ONNX_NAMESPACE::GraphProto& graph_proto,
+      std::ostream& external_stream,
+      int64_t& external_offset) const;
+
 #endif
 
   Version IrVersion() const noexcept {

diff --git a/include/onnxruntime/core/graph/model_saving_options.h b/include/onnxruntime/core/graph/model_saving_options.h
@@ -0,0 +1,44 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+namespace onnxruntime {
+
+class PrepackedForSerialization;
+
+// These options affect how the model initializers are saved.
+// This includes options to align external initializer offsets.
+// For models running on CPU, ORT will try to use mmap to load external
+// initializers. To use mmap, external initializers need to be offset aligned.
+// ORT saves external initializers into a single data file; each initializer is
+// accessed with an offset (start position of the initializer) and a length (byte
+// length of the initializer) in the data file. To use mmap, each offset needs to be
+// aligned, which means the offset needs to be divisible by the allocation granularity
+// (64KB for Windows and 4KB for other OSes). With align_offset set to true, ORT will
+// align offsets for large initializers when saving an ONNX model with an external data file.
+struct ModelSavingOptions {
+  explicit ModelSavingOptions(size_t size_threshold)
+      : initializer_size_threshold(size_threshold) {}
+
+  // Minimal initializer size in bytes to be externalized on disk
+  size_t initializer_size_threshold;
+  // Offset will always be page aligned and allocation granularity aligned for
+  // mmap support. This is done by padding previous tensor data with zeros
+  // keeping same length.
+  bool align_offset = false;
+  // Alignment threshold for size of data.
+  // Having a low threshold will waste file space for small initializers.
+  // Only when the tensor's data size is > align_threshold will it be force
+  // aligned. Default to 1MB.
+  int64_t align_threshold = 1048576;
+  // The allocation Granularity for mmap() support.
+  // Typically 64KB for Windows & 4KB for other OSes. Default to 64KB.
+  int64_t allocation_granularity = 65536;
+  // Optional pointer to a container of pre-packed initializers to be
+  // embedded into the external initializers, so they can also be loaded
+  // from disk.
+  const PrepackedForSerialization* prepacked_for_save = nullptr;
+};
+
+}  // namespace onnxruntime
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -250,6 +250,15 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil
 static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
     "session.optimized_model_external_initializers_min_size_in_bytes";
 
+// Use this config to save pre-packed constant initializers to an external data file.
+// This allows minimizing the ONNX model file size and memory mapping pre-packed initializers on
+// model load.
+// - "0": Default. Do not save pre-packed initializers to a data file.
+// - "1": Save pre-packed constant initializers to an external data file.
+// Sample usage: sess_options.add_session_config_entry(kOrtSessionOptionsSavePrePackedConstantInitializers, "1")
+static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers =
+    "session.save_external_prepacked_constant_initializers";
+
 // Enable EP context feature to dump the partitioned graph which includes the EP context into Onnx file.
 // The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead.
 // "0": disable. (default)

diff --git a/onnxruntime/core/framework/prepacked_weights.h b/onnxruntime/core/framework/prepacked_weights.h
@@ -6,6 +6,7 @@
 #include <vector>
 
 #include "core/common/basic_types.h"
+#include "core/common/inlined_containers_fwd.h"
 #include "core/framework/buffer_deleter.h"
 #include "core/framework/tensor_shape.h"
 
@@ -16,11 +17,14 @@ struct PrePackedWeights final {
   // Hence we hold them in container. It is upto the developer implementing each PrePack()
   // method to define what gets stored in which position of the container.
 
-  std::vector<IAllocatorUniquePtr<void>> buffers_;  // cache pre-packed buffers associated with the kernel
-  std::vector<size_t> buffer_sizes_;                // cache sizes of pre-packed buffers (in bytes)
+  InlinedVector<BufferUniquePtr> buffers_;  // cache pre-packed buffers associated with the kernel
+  InlinedVector<size_t> buffer_sizes_;      // cache sizes of pre-packed buffers (in bytes)
 
   // Produces a hash of the buffers stored in the given instance of this class
   HashValue GetHash() const;
+
+  // The function creates a copy with non-owning BufferUniquePtrs.
+  PrePackedWeights CreateReferringCopy() const;
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/prepacked_weights_container.cc b/onnxruntime/core/framework/prepacked_weights_container.cc
@@ -3,9 +3,21 @@
 
 #include "core/framework/prepacked_weights_container.h"
 #include "core/framework/allocator_utils.h"
+#include "core/graph/graph.h"
 
 namespace onnxruntime {
 
+PrePackedWeights PrePackedWeights::CreateReferringCopy() const {
+  PrePackedWeights copy;
+  for (const auto& prepacked_buffer : buffers_) {
+    // BufferDeleter is constructed with nullptr because we do not own the data in this case
+    copy.buffers_.emplace_back(prepacked_buffer.get(), BufferDeleter(nullptr));
+  }
+
+  copy.buffer_sizes_ = buffer_sizes_;
+  return copy;
+}
+
 AllocatorPtr PrepackedWeightsContainer::GetOrCreateAllocator(const std::string& device_name) {
   auto iter = allocators_.find(device_name);
 
@@ -49,4 +61,87 @@ size_t PrepackedWeightsContainer::GetNumberOfElements() const {
   return prepacked_weights_map_.size();
 }
 
+PrepackedForSerialization::PrepackedForSerialization()
+    : main_graph_(nullptr, key_to_blobs_, false) {
+}
+
+PrepackedForSerialization::~PrepackedForSerialization() = default;
+
+void PrepackedForSerialization::Subgraph::InsertFromDisk(const std::string& key, PrePackedWeights&& packed_weight) {
+  // We may see duplicate entries mapped from disk if the same weight is pre-packed in subgraphs and
+  // up the tree by the same kernel with the same result. emplace() keeps the first entry for a key.
+  key_to_blobs_.emplace(key, std::move(packed_weight));
+}
+
+void PrepackedForSerialization::Subgraph::WritePacked(const std::string& weight_name, const std::string& key,
+                                                      PrePackedWeights&& packed_weight) {
+  auto hit = key_to_blobs_.find(key);
+  if (hit == key_to_blobs_.end()) {
+    // new key
+    key_to_blobs_.emplace(key, std::move(packed_weight));
+    if (save_mode_on_) {
+      sorted_by_weight_for_writing_[weight_name].insert(key);
+    }
+    return;
+  }
+
+  // Key existed, but may or may not have a reference in this subgraph
+  if (save_mode_on_) {
+    auto& list = sorted_by_weight_for_writing_[weight_name];
+    list.insert(key);
+  }
+  hit->second = std::move(packed_weight);
+}
+
+const PrePackedWeights* PrepackedForSerialization::Subgraph::GetPrepackedWeights(const std::string& key) const {
+  auto it = key_to_blobs_.find(key);
+  if (it == key_to_blobs_.end()) {
+    return nullptr;
+  }
+  return &it->second;
+}
+
+std::optional<PrePackedWeights> PrepackedForSerialization::Subgraph::ReplaceWithReferenceIfSaving(
+    const std::string& weight_name,
+    const std::string& key,
+    const PrePackedWeights& refer_if_absent) {
+  auto it = key_to_blobs_.find(key);
+  if (it == key_to_blobs_.end()) {
+    if (save_mode_on_) {
+      key_to_blobs_.emplace(key, refer_if_absent.CreateReferringCopy());
+      sorted_by_weight_for_writing_[weight_name].insert(key);
+    }
+    return std::nullopt;
+  }
+
+  PrePackedWeights result = std::move(it->second);
+  if (save_mode_on_) {
+    it->second = result.CreateReferringCopy();
+    auto& list = sorted_by_weight_for_writing_[weight_name];
+    list.insert(key);
+  } else {
+    key_to_blobs_.erase(it);
+  }
+  return result;
+}
+
+PrepackedForSerialization::Subgraph& PrepackedForSerialization::FindOrCreatePrepackedGraph(const Graph& graph) {
+  if (graph.ParentGraph() == nullptr) {
+    return main_graph_;
+  }
+  auto& parent = FindOrCreatePrepackedGraph(*graph.ParentGraph());
+  return parent.GetOrCreateSubgraph(graph);
+}
+
+const PrepackedForSerialization::Subgraph* PrepackedForSerialization::FindPrepackedGraph(const Graph& graph) const {
+  if (graph.ParentGraph() == nullptr) {
+    return &main_graph_;
+  }
+  auto* parent = FindPrepackedGraph(*graph.ParentGraph());
+  if (parent != nullptr) {
+    parent = parent->GetSubgraph(graph);
+  }
+  return parent;
+}
+
 }  // namespace onnxruntime