Finished serialization.

Certain kernels do not share their pre-packed weights, they simply keep them to themselves. TODO: Make them share.
microsoft · Dec 5, 2024 · 0584f09 · 0584f09
1 parent df8f630
commit 0584f09
Show file tree

Hide file tree

Showing 9 changed files with 176 additions and 173 deletions.
diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h
@@ -1489,12 +1489,14 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   Status AddConstantProtoAsInitializer(const ONNX_NAMESPACE::NodeProto& constant_node_proto,
                                        std::optional<std::string_view> new_name);
 
-  ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitiallizersImpl(const std::filesystem::path& model_path,
-                                                                       const std::filesystem::path& external_file_path,
-                                                                       const ModelSavingOptions& model_saving_options,
-                                                                       ONNX_NAMESPACE::GraphProto& graph_proto,
-                                                                       std::ostream& external_stream,
-                                                                       int64_t& external_offset) const;
+  Status ToGraphProtoWithExternalInitiallizersImpl(
+      const std::filesystem::path& model_path,
+      const std::filesystem::path& external_file_path,
+      const std::filesystem::path& modified_external_file_path,
+      const ModelSavingOptions& model_saving_options,
+      ONNX_NAMESPACE::GraphProto& graph_proto,
+      std::ostream& external_stream,
+      int64_t& external_offset) const;
 
 #endif
 

diff --git a/include/onnxruntime/core/graph/model_saving_options.h b/include/onnxruntime/core/graph/model_saving_options.h
@@ -41,4 +41,4 @@ struct ModelSavingOptions {
   const PrepackedForSerialization* prepacked_for_save = nullptr;
 };
 
-}
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/prepacked_weights_container.cc b/onnxruntime/core/framework/prepacked_weights_container.cc
@@ -56,18 +56,21 @@ PrepackedForSerialization::PrepackedForSerialization()
 
 PrepackedForSerialization::~PrepackedForSerialization() = default;
 
-void PrepackedForSerialization::Subgraph::Insert(std::string key, PrePackedWeights&& packed_weight) {
+void PrepackedForSerialization::Subgraph::InsertFromDisk(std::string key, PrePackedWeights&& packed_weight) {
   auto result = key_to_blobs_.emplace(std::move(key), std::move(packed_weight));
   ORT_ENFORCE(result.second, "Duplicate pre-packed weight from disk");
 }
 
-bool PrepackedForSerialization::Subgraph::CreateOrOverWrite(const std::string& weight_name, std::string key,
-                                                            PrePackedWeights&& packed_weight) {
-  // We overwrite the existing key. This is necessary in case we already have a pre-packed weight
-  // mapped from disk, but we want to overwrite it with our most recent pre-packed version.
-  auto result = key_to_blobs_.insert_or_assign(std::move(key), std::move(packed_weight));
-  weight_to_pre_packs_[weight_name].push_back(result.first);
-  return result.second;
+bool PrepackedForSerialization::Subgraph::WritePackedForSaving(const std::string& weight_name, const std::string& key,
+                                                               PrePackedWeights&& packed_weight) {
+  auto hit = key_to_blobs_.find(key);
+  if (hit == key_to_blobs_.end()) {
+    auto result = key_to_blobs_.insert({key, std::move(packed_weight)});
+    sorted_by_weight_for_writing_[weight_name].push_back(result.first);
+    return true;
+  }
+  hit->second = std::move(packed_weight);
+  return false;
 }
 
 const PrePackedWeights* PrepackedForSerialization::Subgraph::GetPrepackedWeights(const std::string& key) const {
@@ -96,12 +99,23 @@ std::optional<PrePackedWeights> PrepackedForSerialization::TakePrepackedWeights(
   return result;
 }
 
-PrepackedForSerialization::Subgraph& PrepackedForSerialization::FindOrCreateSubgraph(const Graph& graph) {
+PrepackedForSerialization::Subgraph& PrepackedForSerialization::FindOrCreatePrepackedGraph(const Graph& graph) {
   if (graph.ParentGraph() == nullptr) {
     return main_graph_;
   }
-  auto& parent = FindOrCreateSubgraph(*graph.ParentGraph());
+  auto& parent = FindOrCreatePrepackedGraph(*graph.ParentGraph());
   return parent.GetOrCreateSubgraph(graph);
 }
 
+const PrepackedForSerialization::Subgraph* PrepackedForSerialization::FindPrepackedGraph(const Graph& graph) const {
+  if (graph.ParentGraph() == nullptr) {
+    return &main_graph_;
+  }
+  auto* parent = FindPrepackedGraph(*graph.ParentGraph());
+  if (parent != nullptr) {
+    parent = parent->GetSubgraph(graph);
+  }
+  return parent;
+}
+
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/prepacked_weights_container.h b/onnxruntime/core/framework/prepacked_weights_container.h
@@ -91,8 +91,8 @@ class PrepackedForSerialization final {
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PrepackedForSerialization);
 
   using KeyToBlobMap = std::unordered_map<std::string, PrePackedWeights>;
-  using KeyToBlobMapIterator = KeyToBlobMap::iterator;
-  using BlobsInderect = std::vector<KeyToBlobMapIterator>;
+  using KeyToBlobMapConstIterator = KeyToBlobMap::const_iterator;
+  using BlobsInderect = std::vector<KeyToBlobMapConstIterator>;
   using BlobsConstIterator = BlobsInderect::const_iterator;
 
   // Maps weight name to iterators in key_to_blobs_. It associates a weight name with its pre-packs.
@@ -130,11 +130,10 @@ class PrepackedForSerialization final {
       return it == subgraph_prepacks_.end() ? nullptr : it->second.get();
     }
 
-    // This does not populate per-initializer structures.
-    void Insert(std::string key, PrePackedWeights&& packed_weight);
+    void InsertFromDisk(std::string key, PrePackedWeights&& packed_weight);
 
-    bool CreateOrOverWrite(const std::string& weight_name, std::string key,
-                           PrePackedWeights&& packed_weight);
+    bool WritePackedForSaving(const std::string& weight_name, const std::string& key,
+                              PrePackedWeights&& packed_weight);
 
     const PrePackedWeights* GetPrepackedWeights(const std::string& key) const;
 
@@ -148,11 +147,20 @@ class PrepackedForSerialization final {
       save_mode_on_ = value;
     }
 
+    // Returns iterators to key->blob pair for writing
+    const BlobsInderect* GetBlobsForWeight(const std::string& weight_name) const {
+      auto hit = sorted_by_weight_for_writing_.find(weight_name);
+      if (hit != sorted_by_weight_for_writing_.end()) {
+        return &hit->second;
+      }
+      return nullptr;
+    }
+
    private:
     bool save_mode_on_;
     Subgraph* parent_ = nullptr;
     KeyToBlobMap& key_to_blobs_;
-    WeightToPrePacksMap weight_to_pre_packs_;
+    WeightToPrePacksMap sorted_by_weight_for_writing_;
     // Map Graph ptr to subgraphs
     std::unordered_map<const Graph*, std::unique_ptr<Subgraph>> subgraph_prepacks_;
   };
@@ -179,7 +187,9 @@ class PrepackedForSerialization final {
 
   std::optional<PrePackedWeights> TakePrepackedWeights(const std::string& key);
 
-  Subgraph& FindOrCreateSubgraph(const Graph& graph);
+  Subgraph& FindOrCreatePrepackedGraph(const Graph& graph);
+
+  const Subgraph* FindPrepackedGraph(const Graph& graph) const;
 
  private:
   // Map of key to pre-packed blobs.This is common for all subgraphs

diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc
@@ -387,19 +387,18 @@ static Status KernelUseSharedPrePackedBuffers(OpKernel& kernel, int input_idx,
   return Status::OK();
 }
 
-// Here we use the data that is owned by somebody else
-static void SavePrepackedDataForWriting(const std::string& weight_name,
-                                        const std::string& key,
-                                        const PrePackedWeights& prepacked_weights,
-                                        PrepackedForSerialization::Subgraph& prepacked_subgraph) {
+static void WritePrepackedForSaving(const std::string& weight_name,
+                                    const std::string& key,
+                                    const PrePackedWeights& prepacked_weights,
+                                    PrepackedForSerialization::Subgraph& prepacked_subgraph) {
   PrePackedWeights weights_for_saving;
   for (const auto& prepacked_buffer : prepacked_weights.buffers_) {
-    // BufferDeleter is nullptr because we do not own the data
+    // BufferDeleter is nullptr because we do not own the data in this case
     weights_for_saving.buffers_.emplace_back(prepacked_buffer.get(), BufferDeleter(nullptr));
   }
 
   weights_for_saving.buffer_sizes_ = prepacked_weights.buffer_sizes_;
-  prepacked_subgraph.CreateOrOverWrite(weight_name, key, std::move(weights_for_saving));
+  prepacked_subgraph.WritePackedForSaving(weight_name, key, std::move(weights_for_saving));
 }
 
 static std::string GenerateKeyForPrepackedWeightsMap(const std::string& op_type,
@@ -417,7 +416,7 @@ Status SessionState::PrepackConstantInitializedTensors(
     const std::unordered_map<std::string, const OrtValue*>& initializers_to_share_map) {
   auto prepacked_constant_weights = [this, &constant_initializers_use_count, &initializers_to_share_map](
                                         bool should_cache_prepacked_weights_for_shared_initializers) -> Status {
-    auto& prepacked_subgraph = prepacked_weights_for_serialization_.FindOrCreateSubgraph(graph_);
+    auto& prepacked_subgraph = prepacked_weights_for_serialization_.FindOrCreatePrepackedGraph(graph_);
 
     for (auto& node : GetGraphViewer().Nodes()) {
       auto kernel = GetMutableKernel(node.Index());
@@ -492,8 +491,8 @@ Status SessionState::PrepackConstantInitializedTensors(
                       if (prepacked_weights_for_serialization_.IsSaveModeOn()) {
                         // Here we take references to the shared container owned data, so we unmap any entries
                         // that we are mapping from disk
-                        SavePrepackedDataForWriting(input_name, prepacked_weights_container_key, prepacked_shared,
-                                                    prepacked_subgraph);
+                        WritePrepackedForSaving(input_name, prepacked_weights_container_key, prepacked_shared,
+                                                prepacked_subgraph);
                       }
 
                     } else {  // container doesn't contain the pre-packed weight - so write into it for sharing across kernel instances
@@ -523,8 +522,8 @@ Status SessionState::PrepackConstantInitializedTensors(
                       if (prepacked_weights_for_serialization_.IsSaveModeOn()) {
                         // Here we take references to the shared container owned data, so we unmap any entries
                         // that we are mapping from disk, so we write the most fresh data possible
-                        SavePrepackedDataForWriting(input_name, prepacked_weights_container_key, shared_prepacked,
-                                                    prepacked_subgraph);
+                        WritePrepackedForSaving(input_name, prepacked_weights_container_key, shared_prepacked,
+                                                prepacked_subgraph);
                       }
                     }
                   }
@@ -554,8 +553,8 @@ Status SessionState::PrepackConstantInitializedTensors(
 
                     if (prepacked_subgraph.IsSaveModeOn() || weights_to_use == nullptr) {
                       // In this case pre-packed container owns the data
-                      prepacked_subgraph.CreateOrOverWrite(input_name, prepacked_weights_container_key,
-                                                           std::move(weights_to_be_filled_in));
+                      prepacked_subgraph.WritePackedForSaving(input_name, prepacked_weights_container_key,
+                                                              std::move(weights_to_be_filled_in));
                       weights_to_use = prepacked_subgraph.GetPrepackedWeights(prepacked_weights_container_key);
                       assert(weights_to_use != nullptr);
                     }

diff --git a/onnxruntime/core/framework/tensor_external_data_info.cc b/onnxruntime/core/framework/tensor_external_data_info.cc
@@ -4,6 +4,7 @@
 #include "tensor_external_data_info.h"
 #include "core/common/common.h"
 #include "core/common/narrow.h"
+#include "core/common/safeint.h"
 #include "core/common/string_utils.h"
 #include "core/platform/path_lib.h"
 
@@ -54,8 +55,9 @@ Status ExternalDataInfo::Create(const RepeatedPtrField<StringStringEntryProto>&
     } else if (stringmap.key() == "checksum" && !stringmap.value().empty()) {
       out->checksum_ = stringmap.value();
     } else if (stringmap.key().find("prepacked", 0) == 0) {
-      // Starts with 'prepacked'. Each prepacked entry may have multiple blobs with the same key
-      //  we output them with the same key
+      // Starts with 'prepacked', each has its own key.
+      // Each prepacked entry may have multiple blobs with the same key
+      // we output them with the same key
       // format = key|offset;length;checksum[|offset;length;checksum]
       // We are ignoring invalid entries (should not be any), and rely
       // on in memory pre-packs regenerated in this case.
@@ -114,17 +116,38 @@ void ExternalDataInfo::SetExternalLocationToProto(const std::filesystem::path& e
   length->set_value(std::to_string(tensor_bytes_size));
 }
 
-// void ExternalDataInfo::AddPrepackedEntriesToProto(
-//     const PrepackedForSerialization::BlobsInderect& prepacked_for_write, ::ONNX_NAMESPACE::TensorProto& proto) {
-//   size_t prepack_count = 0;
-//   std::stringstream os;
-//   for (auto iter : prepacked_for_write) {
-//     const auto& [key, prepacked_weights] = *iter;
-//     os << key << '|';
-//     const size_t blob_num = prepacked_weights.buffers_.size();
-//     for (size_t i = 0; blob_num; ++i) {
-//       //XXX: Need offset calculation
-//       // os << ed_weights.blobs_[i].offset << ';';
-//     }
-//   }
+std::ostream& ExternalDataInfo::AddPrepackedEntriesToProto(
+    const PrepackedForSerialization::BlobsInderect& prepacked_for_write, bool align, int64_t allocation_granularity,
+    std::ostream& os, int64_t& external_offset, ::ONNX_NAMESPACE::TensorProto& proto) {
+  for (const auto& iter : prepacked_for_write) {
+    size_t prepack_count = 0;
+    const auto& [key, prepacked_weights] = *iter;
+    std::stringstream prepacked_entry;
+    prepacked_entry << key << "|";
+    for (size_t i = 0, size = prepacked_weights.buffers_.size(); i < size; ++i) {
+      if (align) {
+        // return early on error
+        if (!AlignAndPad(os, allocation_granularity, external_offset)) {
+          return os;
+        }
+      }
+      const auto size_in_bytes = prepacked_weights.buffer_sizes_[i];
+      if (prepack_count++ > 0) {
+        prepacked_entry << "|";
+      }
+      // Checksum is currently not validated
+      prepacked_entry << external_offset << ";" << size_in_bytes << ";0";
+      if (!os.write(reinterpret_cast<const char*>(prepacked_weights.buffers_[i].get()), size_in_bytes)) {
+        return os;
+      }
+      external_offset = SafeInt<int64_t>(external_offset) + size_in_bytes;
+    }
+    auto* prepacked = proto.add_external_data();
+    std::string prepacked_key("prepacked_");
+    prepacked_key.append(std::to_string(prepack_count));
+    prepacked->set_key(std::move(prepacked_key));
+    prepacked->set_value(prepacked_entry.str());
+  }
+  return os;
+}
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/tensor_external_data_info.h b/onnxruntime/core/framework/tensor_external_data_info.h
@@ -3,6 +3,7 @@
 #pragma once
 
 #include <filesystem>
+#include <ostream>
 #include <string>
 #include <tuple>
 
@@ -39,8 +40,33 @@ class ExternalDataInfo {
                                          size_t tensor_bytes_size,
                                          ::ONNX_NAMESPACE::TensorProto& proto);
 
-  static void AddPrepackedEntriesToProto(const PrepackedForSerialization::BlobsInderect& prepacked_for_write,
-                                         ::ONNX_NAMESPACE::TensorProto& proto);
+  // Pads the output with zeros according to the specified allocation_granularity
+  // It updates external_offset for alignment.
+  // need to do padding before write actual tensor data as we do offset alignment at the begin of
+  // large tensors (offset need to be page aligned and allocation granularity aligned) like below:
+  // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX
+  // |<---smaller tensor---->|<---padding--->|<------------------large tensor----------------------------->|
+  static std::ostream& AlignAndPad(std::ostream& stream, int64_t allocation_granularity, int64_t& external_offset) {
+    // Align to the larger of the page size or the allocation granularity
+    int64_t alignment_factor = std::max(static_cast<int64_t>(4096), allocation_granularity);
+    // Align to the next page or alloc granularity boundary
+    int64_t new_external_offset = static_cast<int64_t>(
+                                      std::floor((external_offset + alignment_factor - 1) / alignment_factor)) *
+                                  alignment_factor;
+
+    // padding tensor with zeros for alignment
+    for (int64_t index = external_offset; index != new_external_offset; ++index) {
+      stream << '\0';
+    }
+    external_offset = new_external_offset;
+    return stream;
+  }
+
+  static std::ostream& AddPrepackedEntriesToProto(const PrepackedForSerialization::BlobsInderect& prepacked_for_write,
+                                                  bool align, int64_t allocation_granularity,
+                                                  std::ostream& os,
+                                                  int64_t& external_offset,
+                                                  ::ONNX_NAMESPACE::TensorProto& proto);
 
   using PrepackedInfo = std::tuple<OFFSET_TYPE, size_t, std::string>;
   using PrepackedInfos = std::unordered_map<std::string, std::vector<PrepackedInfo>>;

diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc
@@ -1057,6 +1057,8 @@ Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& mo
     if (prepacked_info != nullptr && !prepacked_infos->empty()) {
       for (const auto& [key, blobs] : *prepacked_infos) {
         PrePackedWeights prepacked_weights;
+        prepacked_weights.buffers_.reserve(blobs.size());
+        prepacked_weights.buffer_sizes_.reserve(blobs.size());
         for (const auto& blob : blobs) {
           const auto blob_offset = std::get<0>(blob);
           const auto blob_length = std::get<1>(blob);
@@ -1074,7 +1076,7 @@ Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& mo
           prepacked_weights.buffer_sizes_.push_back(blob_length);
         }
         if (!blobs.empty()) {
-          prepacked_info->Insert(key, std::move(prepacked_weights));
+          prepacked_info->InsertFromDisk(key, std::move(prepacked_weights));
         }
       }
     }
-Original file line number
+Diff line change
@@ Expand Up / @@ -41,4 +41,4 @@ struct ModelSavingOptions { @@
       const PrepackedForSerialization* prepacked_for_save = nullptr;
     };
-    }
+    }  // namespace onnxruntime