Optimize the container

Implement deserialization path. Start graph saving.
microsoft · Dec 4, 2024 · df8f630 · df8f630
1 parent 7b0fa40
commit df8f630
Show file tree

Hide file tree

Showing 26 changed files with 814 additions and 197 deletions.
diff --git a/include/onnxruntime/core/framework/buffer_deleter.h b/include/onnxruntime/core/framework/buffer_deleter.h
@@ -5,6 +5,8 @@
 
 #include "core/framework/allocator.h"
 
+#include <functional>
+
 namespace onnxruntime {
 
 // TODO: Do we need this class or is IAllocator::MakeUniquePtr sufficient/better
@@ -31,6 +33,6 @@ class BufferDeleter {
   AllocatorPtr alloc_{nullptr};
 };
 
-using BufferUniquePtr = std::unique_ptr<void, BufferDeleter>;
+using BufferUniquePtr = std::unique_ptr<void, std::function<void(void*)>>;
 using BufferNakedPtr = void*;
 }  // namespace onnxruntime
diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h
@@ -41,6 +41,7 @@ namespace onnxruntime {
 class Graph;
 struct IndexedSubGraph;
 class Model;
+struct ModelSavingOptions;
 class OpSignature;
 
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
@@ -1153,29 +1154,6 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   const ONNX_NAMESPACE::GraphProto& ToGraphProto();
   ONNX_NAMESPACE::GraphProto ToGraphProto() const;
 
-  // Options to align external initializer offset.
-  // For models running on CPU, ORT will try to use mmap to load external initializers.
-  // To use mmap, external initializer need to be offset aligned.
-  // ORT saves external initializers into signle data file, each initializer is accessed with
-  // offset(start position of initializer) and length(byte length of initializer) of the data file.
-  // To use mmap, each offset need to be aligned which means offset need to divisible by
-  // allocation granularity(64KB for windows and 4K for other OSes).
-  // With align_offset to true, ORT will align offset for large initializer when
-  // save ONNX model with external data file.
-  struct OffsetAlignmentInfo {
-    // Offset will always be page aligned and allocation granularity aligned for mmap support.
-    // This is done by padding previous tensor data with zeros keeping same length.
-    bool align_offset = false;
-    // Alignment threshold for size of data.
-    // Having a low threshold will waste file space for small initializers.
-    // Only when tensor's data size is > the page_align_threshold it will be force aligned.
-    // Default to 1MB.
-    int64_t align_threshold = 1048576;
-    // The allocation Granularity for mmap() support.
-    // Typically 64KB for Windows & 4KB for other OSes. Default to 64KB.
-    int64_t allocation_granularity = 65536;
-  };
-
   /** Gets the GraphProto representation of this Graph
   @param external_file_path File path of the binary file to use for initializers.
   @param model_file_path path of the model file.
@@ -1186,15 +1164,7 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   */
   ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
                                                                   const std::filesystem::path& model_file_path,
-                                                                  size_t initializer_size_threshold,
-                                                                  const OffsetAlignmentInfo& align_info) const;
-
-  ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
-                                                                  const std::filesystem::path& model_file_path,
-                                                                  size_t initializer_size_threshold) const {
-    OffsetAlignmentInfo default_options;
-    return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options);
-  }
+                                                                  const ModelSavingOptions& model_saving_options) const;
 
   /** Gets the ISchemaRegistry instances being used with this Graph. */
   IOnnxRuntimeOpSchemaCollectionPtr GetSchemaRegistry() const;
@@ -1519,6 +1489,13 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   Status AddConstantProtoAsInitializer(const ONNX_NAMESPACE::NodeProto& constant_node_proto,
                                        std::optional<std::string_view> new_name);
 
+  ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitiallizersImpl(const std::filesystem::path& model_path,  // presumably the worker behind ToGraphProtoWithExternalInitializers; confirm
+                                                                       const std::filesystem::path& external_file_path,  // NOTE(review): path order (model, external) is the reverse of the public method (external, model)
+                                                                       const ModelSavingOptions& model_saving_options,
+                                                                       ONNX_NAMESPACE::GraphProto& graph_proto,
+                                                                       std::ostream& external_stream,
+                                                                       int64_t& external_offset) const;  // NOTE(review): name spells "Initiallizers" (double l), unlike the public method
+
 #endif
 
   Version IrVersion() const noexcept {

diff --git a/include/onnxruntime/core/graph/model_saving_options.h b/include/onnxruntime/core/graph/model_saving_options.h
@@ -0,0 +1,44 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+namespace onnxruntime {
+
+class PrepackedForSerialization;
+
+// Options that affect how the model initializers are saved, including how
+// external initializer offsets are aligned.
+// For models running on CPU, ORT will try to use mmap to load external
+// initializers. ORT saves external initializers into a single data file; each
+// initializer is accessed by its offset (start position) and length (byte
+// length) within that file. To use mmap, each offset needs to be divisible by
+// the allocation granularity (64KB for Windows and 4KB for other OSes). With
+// align_offset set to true, ORT aligns the offset of large initializers when
+// saving an ONNX model with an external data file.
+// NOTE(review): size_t/int64_t are used below but this header includes nothing.
+struct ModelSavingOptions {
+  explicit ModelSavingOptions(size_t size_threshold)
+      : initializer_size_threshold(size_threshold) {}
+
+  // Minimal initializer size in bytes to be externalized on disk
+  size_t initializer_size_threshold;
+  // When true, offsets are page aligned and allocation granularity aligned for
+  // mmap support. This is done by padding previous tensor data with zeros
+  // keeping same length.
+  bool align_offset = false;
+  // Alignment threshold for size of data.
+  // Having a low threshold will waste file space for small initializers.
+  // Only when a tensor's data size is > align_threshold will it be force
+  // aligned. Defaults to 1MB.
+  int64_t align_threshold = 1048576;
+  // The allocation granularity for mmap() support.
+  // Typically 64KB for Windows & 4KB for other OSes. Defaults to 64KB.
+  int64_t allocation_granularity = 65536;
+  // Optional pointer to a container of pre-packed initializers to be
+  // embedded into the external initializers, so they can also be loaded
+  // from disk.
+  const PrepackedForSerialization* prepacked_for_save = nullptr;
+};
+
+}
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -250,6 +250,15 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil
 static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
     "session.optimized_model_external_initializers_min_size_in_bytes";
 
+// Use this config to save pre-packed constant initializers to an external data file.
+// This minimizes the ONNX model file size and allows pre-packed initializers to be
+// memory mapped on model load.
+// - "0": Default. Do not save pre-packed initializers to a data file.
+// - "1": Save pre-packed constant initializers to an external data file.
+// Sample usage: sess_options.add_session_config_entry(kOrtSessionOptionsSavePrePackedConstantInitializers,  "1")
+static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers =
+    "session.save_external_prepacked_constant_initializers";
+
 // Enable EP context feature to dump the partitioned graph which includes the EP context into Onnx file.
 // The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead.
 // "0": disable. (default)

diff --git a/onnxruntime/core/framework/prepacked_weights.h b/onnxruntime/core/framework/prepacked_weights.h
@@ -6,6 +6,7 @@
 #include <vector>
 
 #include "core/common/basic_types.h"
+#include "core/common/inlined_containers_fwd.h"
 #include "core/framework/buffer_deleter.h"
 #include "core/framework/tensor_shape.h"
 
@@ -16,8 +17,8 @@ struct PrePackedWeights final {
   // Hence we hold them in container. It is upto the developer implementing each PrePack()
   // method to define what gets stored in which position of the container.
 
-  std::vector<IAllocatorUniquePtr<void>> buffers_;  // cache pre-packed buffers associated with the kernel
-  std::vector<size_t> buffer_sizes_;                // cache sizes of pre-packed buffers (in bytes)
+  InlinedVector<BufferUniquePtr> buffers_;  // cache pre-packed buffers associated with the kernel
+  InlinedVector<size_t> buffer_sizes_;      // cache sizes of pre-packed buffers (in bytes)
 
   // Produces a hash of the buffers stored in the given instance of this class
   HashValue GetHash() const;

diff --git a/onnxruntime/core/framework/prepacked_weights_container.cc b/onnxruntime/core/framework/prepacked_weights_container.cc
@@ -3,6 +3,7 @@
 
 #include "core/framework/prepacked_weights_container.h"
 #include "core/framework/allocator_utils.h"
+#include "core/graph/graph.h"
 
 namespace onnxruntime {
 
@@ -49,4 +50,58 @@ size_t PrepackedWeightsContainer::GetNumberOfElements() const {
   return prepacked_weights_map_.size();
 }
 
+PrepackedForSerialization::PrepackedForSerialization()
+    : main_graph_(nullptr, key_to_blobs_, false) {  // root scope: no parent, refers to key_to_blobs_, save mode off
+}
+
+PrepackedForSerialization::~PrepackedForSerialization() = default;  // defaulted here rather than in the header
+
+void PrepackedForSerialization::Subgraph::Insert(std::string key, PrePackedWeights&& packed_weight) {
+  auto result = key_to_blobs_.emplace(std::move(key), std::move(packed_weight));  // emplace leaves an existing entry untouched
+  ORT_ENFORCE(result.second, "Duplicate pre-packed weight from disk");  // fails if the key was already present; weight_to_pre_packs_ is not updated here
+}
+
+bool PrepackedForSerialization::Subgraph::CreateOrOverWrite(const std::string& weight_name, std::string key,
+                                                            PrePackedWeights&& packed_weight) {
+  // Overwrite any existing entry: we may already hold a pre-packed weight mapped from disk, but the
+  // most recent pre-packed version must win. The iterator is recorded under weight_name either way.
+  auto result = key_to_blobs_.insert_or_assign(std::move(key), std::move(packed_weight));
+  weight_to_pre_packs_[weight_name].push_back(result.first);  // NOTE(review): a later rehash of key_to_blobs_ invalidates stored iterators
+  return result.second;  // true if a new entry was inserted, false if an existing one was overwritten
+}
+
+const PrePackedWeights* PrepackedForSerialization::Subgraph::GetPrepackedWeights(const std::string& key) const {
+  auto it = key_to_blobs_.find(key);
+  if (it == key_to_blobs_.end()) {
+    return nullptr;  // no blob stored under this key
+  }
+  return &it->second;  // non-owning pointer to the entry inside key_to_blobs_
+}
+
+PrePackedWeights* PrepackedForSerialization::Subgraph::GetPrepackedWeights(const std::string& key) {
+  auto it = key_to_blobs_.find(key);
+  if (it == key_to_blobs_.end()) {
+    return nullptr;  // no blob stored under this key
+  }
+  return &it->second;  // non-owning, mutable pointer to the entry inside key_to_blobs_
+}
+
+std::optional<PrePackedWeights> PrepackedForSerialization::TakePrepackedWeights(const std::string& key) {
+  auto it = key_to_blobs_.find(key);
+  if (it == key_to_blobs_.end()) {
+    return std::nullopt;  // nothing stored under this key
+  }
+  PrePackedWeights result = std::move(it->second);  // move the blob out before its map entry is removed
+  key_to_blobs_.erase(it);  // NOTE(review): an iterator to this entry recorded in a Subgraph's weight_to_pre_packs_ would dangle
+  return result;
+}
+
+PrepackedForSerialization::Subgraph& PrepackedForSerialization::FindOrCreateSubgraph(const Graph& graph) {
+  if (graph.ParentGraph() == nullptr) {
+    return main_graph_;  // a graph without a parent maps to the root scope
+  }
+  auto& parent = FindOrCreateSubgraph(*graph.ParentGraph());  // recurse upward so every ancestor scope exists first
+  return parent.GetOrCreateSubgraph(graph);
+}
+
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/prepacked_weights_container.h b/onnxruntime/core/framework/prepacked_weights_container.h
@@ -3,19 +3,25 @@
 
 #pragma once
 
-#include <unordered_map>
-#include <unordered_set>
-#include <string>
-#include <cstdint>
-
-#include "core/framework/buffer_deleter.h"
-
+#include "core/common/common.h"
 #include "core/framework/allocator.h"
-#include <mutex>
 #include "prepacked_weights.h"
 
+#include <cstdint>
+#include <mutex>
+#include <optional>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+
 namespace onnxruntime {
 
+#ifndef SHARED_PROVIDER
+class Graph;
+#else
+struct Graph;
+#endif
+
 class PrepackedWeightsContainer final {
  public:
   PrepackedWeightsContainer() {
@@ -66,4 +72,120 @@ class PrepackedWeightsContainer final {
   std::unordered_map<std::string, PrePackedWeights> prepacked_weights_map_;
 };
 
+/// <summary>
+/// This class has a dual purpose.
+/// When saving to disk is ON (IsSaveModeOn() returns true),
+/// it provides a storage container for PrePackedWeights instances. The pre-packed
+/// data is collected while constant initializers are pre-packed. In this case newly
+/// pre-packed data is used for writing to disk, unless old data matches.
+///
+/// If saving is OFF, it is used to contain the weights memory mapped from disk.
+/// Those weights are then moved to the shared container if weight sharing is enabled,
+/// and also to the interested kernels.
+/// </summary>
+class PrepackedForSerialization final {
+ public:
+  explicit PrepackedForSerialization();
+  ~PrepackedForSerialization();
+
+  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PrepackedForSerialization);
+
+  using KeyToBlobMap = std::unordered_map<std::string, PrePackedWeights>;
+  using KeyToBlobMapIterator = KeyToBlobMap::iterator;  // NOTE(review): unordered_map iterators do not survive a rehash
+  using BlobsInderect = std::vector<KeyToBlobMapIterator>;  // NOTE(review): "Inderect" looks like a typo of "Indirect"
+  using BlobsConstIterator = BlobsInderect::const_iterator;
+
+  // Maps a weight name to iterators into key_to_blobs_, associating the weight with its pre-packs.
+  // Normally, a single weight produces a single PrePackedWeights. But it is possible that a weight
+  // is pre-packed by different kernels.
+  using WeightToPrePacksMap = std::unordered_map<std::string, BlobsInderect>;
+
+  class Subgraph {
+   public:
+    Subgraph(Subgraph* par, KeyToBlobMap& key_blobs, bool overwrite_for_save)
+        : save_mode_on_(overwrite_for_save), parent_(par), key_to_blobs_(key_blobs) {  // key_blobs is held by reference, not copied
+    }
+
+    ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Subgraph);
+
+    Subgraph* Parent() noexcept {  // nullptr when this scope was constructed without a parent
+      return parent_;
+    }
+
+    Subgraph& GetOrCreateSubgraph(const Graph& graph) {  // child scope keyed by &graph, created on first use
+      auto result = subgraph_prepacks_.emplace(&graph, nullptr);
+      if (result.second) {
+        result.first->second = std::make_unique<Subgraph>(this, key_to_blobs_, save_mode_on_);  // child refers to the same blob map and copies the current save mode
+      }
+      return *result.first->second;
+    }
+
+    const Subgraph* GetSubgraph(const Graph& graph) const {
+      auto it = subgraph_prepacks_.find(&graph);
+      return it == subgraph_prepacks_.end() ? nullptr : it->second.get();  // nullptr when no child scope exists for &graph
+    }
+
+    Subgraph* GetSubgraph(const Graph& graph) {
+      auto it = subgraph_prepacks_.find(&graph);
+      return it == subgraph_prepacks_.end() ? nullptr : it->second.get();  // nullptr when no child scope exists for &graph
+    }
+
+    // Stores the blob under key only; it does not populate per-initializer structures.
+    void Insert(std::string key, PrePackedWeights&& packed_weight);
+
+    bool CreateOrOverWrite(const std::string& weight_name, std::string key,
+                           PrePackedWeights&& packed_weight);  // see the .cc: returns whether the key was newly inserted
+
+    const PrePackedWeights* GetPrepackedWeights(const std::string& key) const;
+
+    PrePackedWeights* GetPrepackedWeights(const std::string& key);
+
+    bool IsSaveModeOn() const noexcept {
+      return save_mode_on_;
+    }
+
+    void SetSaveMode(bool value) noexcept {  // changes this scope's flag only
+      save_mode_on_ = value;
+    }
+
+   private:
+    bool save_mode_on_;
+    Subgraph* parent_ = nullptr;
+    KeyToBlobMap& key_to_blobs_;  // reference to the blob map supplied at construction
+    WeightToPrePacksMap weight_to_pre_packs_;
+    // Child scopes keyed by the address of their Graph; each child is owned by this map
+    std::unordered_map<const Graph*, std::unique_ptr<Subgraph>> subgraph_prepacks_;
+  };
+
+  const Subgraph& MainGraph() const noexcept {
+    return main_graph_;
+  }
+
+  Subgraph& MainGraph() noexcept {
+    return main_graph_;
+  }
+
+  size_t GetNumberOfKeyedBlobs() const noexcept {
+    return key_to_blobs_.size();
+  }
+
+  void SetSaveMode(bool value) noexcept {
+    main_graph_.SetSaveMode(value);  // NOTE(review): only the root flag changes; subgraphs created earlier keep the value they copied
+  }
+
+  bool IsSaveModeOn() const noexcept {
+    return main_graph_.IsSaveModeOn();
+  }
+
+  std::optional<PrePackedWeights> TakePrepackedWeights(const std::string& key);
+
+  Subgraph& FindOrCreateSubgraph(const Graph& graph);
+
+ private:
+  // Map of key to pre-packed blobs. This is common for all subgraphs.
+  // The key is : op_type + "+" + hash_of_prepacked_buffers_in_the_PrepackedWeights_instance.
+  // as defined above. We store keys for all scopes (main graph and subgraphs)
+  KeyToBlobMap key_to_blobs_;
+  Subgraph main_graph_;  // declared after key_to_blobs_, which it holds a reference to
+};
 }  // namespace onnxruntime