From 0584f09fec98766c54a53391866cfc2e5b0258a6 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <dmitrism@microsoft.com>
Date: Wed, 4 Dec 2024 16:49:04 -0800
Subject: [PATCH] Finished serialization. Certain kernels do not share their
 pre-packed weights; they simply keep them to themselves. TODO: Make them
 share.

---
 include/onnxruntime/core/graph/graph.h        |  14 +-
 .../core/graph/model_saving_options.h         |   2 +-
 .../framework/prepacked_weights_container.cc  |  34 ++--
 .../framework/prepacked_weights_container.h   |  26 ++-
 onnxruntime/core/framework/session_state.cc   |  27 ++-
 .../framework/tensor_external_data_info.cc    |  53 ++++--
 .../framework/tensor_external_data_info.h     |  30 +++-
 .../core/framework/tensorprotoutils.cc        |   4 +-
 onnxruntime/core/graph/graph.cc               | 159 +++++-------------
 9 files changed, 176 insertions(+), 173 deletions(-)

diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h
index 7e0d74eb343d0..cdc2875660d7b 100644
--- a/include/onnxruntime/core/graph/graph.h
+++ b/include/onnxruntime/core/graph/graph.h
@@ -1489,12 +1489,14 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   Status AddConstantProtoAsInitializer(const ONNX_NAMESPACE::NodeProto& constant_node_proto,
                                        std::optional<std::string_view> new_name);
 
-  ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitiallizersImpl(const std::filesystem::path& model_path,
-                                                                       const std::filesystem::path& external_file_path,
-                                                                       const ModelSavingOptions& model_saving_options,
-                                                                       ONNX_NAMESPACE::GraphProto& graph_proto,
-                                                                       std::ostream& external_stream,
-                                                                       int64_t& external_offset) const;
+  Status ToGraphProtoWithExternalInitiallizersImpl(
+      const std::filesystem::path& model_path,
+      const std::filesystem::path& external_file_path,
+      const std::filesystem::path& modified_external_file_path,
+      const ModelSavingOptions& model_saving_options,
+      ONNX_NAMESPACE::GraphProto& graph_proto,
+      std::ostream& external_stream,
+      int64_t& external_offset) const;
 
 #endif
 
diff --git a/include/onnxruntime/core/graph/model_saving_options.h b/include/onnxruntime/core/graph/model_saving_options.h
index d4ed2d0668f87..2df67e625a55f 100644
--- a/include/onnxruntime/core/graph/model_saving_options.h
+++ b/include/onnxruntime/core/graph/model_saving_options.h
@@ -41,4 +41,4 @@ struct ModelSavingOptions {
   const PrepackedForSerialization* prepacked_for_save = nullptr;
 };
 
-}
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/prepacked_weights_container.cc b/onnxruntime/core/framework/prepacked_weights_container.cc
index 3be398c9d54fc..bc9ae2a5873af 100644
--- a/onnxruntime/core/framework/prepacked_weights_container.cc
+++ b/onnxruntime/core/framework/prepacked_weights_container.cc
@@ -56,18 +56,21 @@ PrepackedForSerialization::PrepackedForSerialization()
 
 PrepackedForSerialization::~PrepackedForSerialization() = default;
 
-void PrepackedForSerialization::Subgraph::Insert(std::string key, PrePackedWeights&& packed_weight) {
+void PrepackedForSerialization::Subgraph::InsertFromDisk(std::string key, PrePackedWeights&& packed_weight) {
   auto result = key_to_blobs_.emplace(std::move(key), std::move(packed_weight));
   ORT_ENFORCE(result.second, "Duplicate pre-packed weight from disk");
 }
 
-bool PrepackedForSerialization::Subgraph::CreateOrOverWrite(const std::string& weight_name, std::string key,
-                                                            PrePackedWeights&& packed_weight) {
-  // We overwrite the existing key. This is necessary in case we already have a pre-packed weight
-  // mapped from disk, but we want to overwrite it with our most recent pre-packed version.
-  auto result = key_to_blobs_.insert_or_assign(std::move(key), std::move(packed_weight));
-  weight_to_pre_packs_[weight_name].push_back(result.first);
-  return result.second;
+bool PrepackedForSerialization::Subgraph::WritePackedForSaving(const std::string& weight_name, const std::string& key,
+                                                               PrePackedWeights&& packed_weight) {
+  // Existing key: only the blob is replaced (returns false). New key: stored, indexed by weight name (returns true).
+  if (auto existing = key_to_blobs_.find(key); existing != key_to_blobs_.end()) {
+    existing->second = std::move(packed_weight);
+    return false;
+  }
+  const auto inserted = key_to_blobs_.insert({key, std::move(packed_weight)});
+  sorted_by_weight_for_writing_[weight_name].push_back(inserted.first);
+  return true;
+}
 
 const PrePackedWeights* PrepackedForSerialization::Subgraph::GetPrepackedWeights(const std::string& key) const {
@@ -96,12 +99,23 @@ std::optional<PrePackedWeights> PrepackedForSerialization::TakePrepackedWeights(
   return result;
 }
 
-PrepackedForSerialization::Subgraph& PrepackedForSerialization::FindOrCreateSubgraph(const Graph& graph) {
+PrepackedForSerialization::Subgraph& PrepackedForSerialization::FindOrCreatePrepackedGraph(const Graph& graph) {
   if (graph.ParentGraph() == nullptr) {
     return main_graph_;
   }
-  auto& parent = FindOrCreateSubgraph(*graph.ParentGraph());
+  auto& parent = FindOrCreatePrepackedGraph(*graph.ParentGraph());
   return parent.GetOrCreateSubgraph(graph);
 }
 
+const PrepackedForSerialization::Subgraph* PrepackedForSerialization::FindPrepackedGraph(const Graph& graph) const {
+  // A graph without a parent maps to main_graph_; nested graphs are resolved through their parent chain.
+  const Graph* parent_graph = graph.ParentGraph();
+  if (parent_graph == nullptr) {
+    return &main_graph_;
+  }
+  // A nullptr anywhere up the chain means there is no Subgraph entry for this graph.
+  const Subgraph* parent_prepacked = FindPrepackedGraph(*parent_graph);
+  return parent_prepacked == nullptr ? nullptr : parent_prepacked->GetSubgraph(graph);
+}
+
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/prepacked_weights_container.h b/onnxruntime/core/framework/prepacked_weights_container.h
index c6fc9a209edb4..a072a0bdc04c5 100644
--- a/onnxruntime/core/framework/prepacked_weights_container.h
+++ b/onnxruntime/core/framework/prepacked_weights_container.h
@@ -91,8 +91,8 @@ class PrepackedForSerialization final {
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PrepackedForSerialization);
 
   using KeyToBlobMap = std::unordered_map<std::string, PrePackedWeights>;
-  using KeyToBlobMapIterator = KeyToBlobMap::iterator;
-  using BlobsInderect = std::vector<KeyToBlobMapIterator>;
+  using KeyToBlobMapConstIterator = KeyToBlobMap::const_iterator;
+  using BlobsInderect = std::vector<KeyToBlobMapConstIterator>;
   using BlobsConstIterator = BlobsInderect::const_iterator;
 
   // Maps weight name to iterators in key_to_blobs_. It associates a weight name with its pre-packs.
@@ -130,11 +130,10 @@ class PrepackedForSerialization final {
       return it == subgraph_prepacks_.end() ? nullptr : it->second.get();
     }
 
-    // This does not populate per-initializer structures.
-    void Insert(std::string key, PrePackedWeights&& packed_weight);
+    void InsertFromDisk(std::string key, PrePackedWeights&& packed_weight);
 
-    bool CreateOrOverWrite(const std::string& weight_name, std::string key,
-                           PrePackedWeights&& packed_weight);
+    bool WritePackedForSaving(const std::string& weight_name, const std::string& key,
+                              PrePackedWeights&& packed_weight);
 
     const PrePackedWeights* GetPrepackedWeights(const std::string& key) const;
 
@@ -148,11 +147,20 @@ class PrepackedForSerialization final {
       save_mode_on_ = value;
     }
 
+    // Looks up the key->blob iterators recorded for `weight_name`; returns nullptr when the weight has none.
+    const BlobsInderect* GetBlobsForWeight(const std::string& weight_name) const {
+      const auto found = sorted_by_weight_for_writing_.find(weight_name);
+      if (found == sorted_by_weight_for_writing_.end()) {
+        return nullptr;
+      }
+      return &found->second;
+    }
+
    private:
     bool save_mode_on_;
     Subgraph* parent_ = nullptr;
     KeyToBlobMap& key_to_blobs_;
-    WeightToPrePacksMap weight_to_pre_packs_;
+    WeightToPrePacksMap sorted_by_weight_for_writing_;
     // Map Graph ptr to subgraphs
     std::unordered_map<const Graph*, std::unique_ptr<Subgraph>> subgraph_prepacks_;
   };
@@ -179,7 +187,9 @@ class PrepackedForSerialization final {
 
   std::optional<PrePackedWeights> TakePrepackedWeights(const std::string& key);
 
-  Subgraph& FindOrCreateSubgraph(const Graph& graph);
+  Subgraph& FindOrCreatePrepackedGraph(const Graph& graph);
+
+  const Subgraph* FindPrepackedGraph(const Graph& graph) const;
 
  private:
   // Map of key to pre-packed blobs.This is common for all subgraphs
diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc
index e581553e2208a..f1974c9576ad4 100644
--- a/onnxruntime/core/framework/session_state.cc
+++ b/onnxruntime/core/framework/session_state.cc
@@ -387,19 +387,18 @@ static Status KernelUseSharedPrePackedBuffers(OpKernel& kernel, int input_idx,
   return Status::OK();
 }
 
-// Here we use the data that is owned by somebody else
-static void SavePrepackedDataForWriting(const std::string& weight_name,
-                                        const std::string& key,
-                                        const PrePackedWeights& prepacked_weights,
-                                        PrepackedForSerialization::Subgraph& prepacked_subgraph) {
+static void WritePrepackedForSaving(const std::string& weight_name,
+                                    const std::string& key,
+                                    const PrePackedWeights& prepacked_weights,
+                                    PrepackedForSerialization::Subgraph& prepacked_subgraph) {
   PrePackedWeights weights_for_saving;
   for (const auto& prepacked_buffer : prepacked_weights.buffers_) {
-    // BufferDeleter is nullptr because we do not own the data
+    // BufferDeleter is nullptr because we do not own the data in this case
     weights_for_saving.buffers_.emplace_back(prepacked_buffer.get(), BufferDeleter(nullptr));
   }
 
   weights_for_saving.buffer_sizes_ = prepacked_weights.buffer_sizes_;
-  prepacked_subgraph.CreateOrOverWrite(weight_name, key, std::move(weights_for_saving));
+  prepacked_subgraph.WritePackedForSaving(weight_name, key, std::move(weights_for_saving));
 }
 
 static std::string GenerateKeyForPrepackedWeightsMap(const std::string& op_type,
@@ -417,7 +416,7 @@ Status SessionState::PrepackConstantInitializedTensors(
     const std::unordered_map<std::string, const OrtValue*>& initializers_to_share_map) {
   auto prepacked_constant_weights = [this, &constant_initializers_use_count, &initializers_to_share_map](
                                         bool should_cache_prepacked_weights_for_shared_initializers) -> Status {
-    auto& prepacked_subgraph = prepacked_weights_for_serialization_.FindOrCreateSubgraph(graph_);
+    auto& prepacked_subgraph = prepacked_weights_for_serialization_.FindOrCreatePrepackedGraph(graph_);
 
     for (auto& node : GetGraphViewer().Nodes()) {
       auto kernel = GetMutableKernel(node.Index());
@@ -492,8 +491,8 @@ Status SessionState::PrepackConstantInitializedTensors(
                       if (prepacked_weights_for_serialization_.IsSaveModeOn()) {
                         // Here we take references to the shared container owned data, so we unmap any entries
                         // that we are mapping from disk
-                        SavePrepackedDataForWriting(input_name, prepacked_weights_container_key, prepacked_shared,
-                                                    prepacked_subgraph);
+                        WritePrepackedForSaving(input_name, prepacked_weights_container_key, prepacked_shared,
+                                                prepacked_subgraph);
                       }
 
                     } else {  // container doesn't contain the pre-packed weight - so write into it for sharing across kernel instances
@@ -523,8 +522,8 @@ Status SessionState::PrepackConstantInitializedTensors(
                       if (prepacked_weights_for_serialization_.IsSaveModeOn()) {
                         // Here we take references to the shared container owned data, so we unmap any entries
                         // that we are mapping from disk, so we write the most fresh data possible
-                        SavePrepackedDataForWriting(input_name, prepacked_weights_container_key, shared_prepacked,
-                                                    prepacked_subgraph);
+                        WritePrepackedForSaving(input_name, prepacked_weights_container_key, shared_prepacked,
+                                                prepacked_subgraph);
                       }
                     }
                   }
@@ -554,8 +553,8 @@ Status SessionState::PrepackConstantInitializedTensors(
 
                     if (prepacked_subgraph.IsSaveModeOn() || weights_to_use == nullptr) {
                       // In this case pre-packed container owns the data
-                      prepacked_subgraph.CreateOrOverWrite(input_name, prepacked_weights_container_key,
-                                                           std::move(weights_to_be_filled_in));
+                      prepacked_subgraph.WritePackedForSaving(input_name, prepacked_weights_container_key,
+                                                              std::move(weights_to_be_filled_in));
                       weights_to_use = prepacked_subgraph.GetPrepackedWeights(prepacked_weights_container_key);
                       assert(weights_to_use != nullptr);
                     }
diff --git a/onnxruntime/core/framework/tensor_external_data_info.cc b/onnxruntime/core/framework/tensor_external_data_info.cc
index 4e6e2a4a82a17..c4f09bdba6256 100644
--- a/onnxruntime/core/framework/tensor_external_data_info.cc
+++ b/onnxruntime/core/framework/tensor_external_data_info.cc
@@ -4,6 +4,7 @@
 #include "tensor_external_data_info.h"
 #include "core/common/common.h"
 #include "core/common/narrow.h"
+#include "core/common/safeint.h"
 #include "core/common/string_utils.h"
 #include "core/platform/path_lib.h"
 
@@ -54,8 +55,9 @@ Status ExternalDataInfo::Create(const RepeatedPtrField<StringStringEntryProto>&
     } else if (stringmap.key() == "checksum" && !stringmap.value().empty()) {
       out->checksum_ = stringmap.value();
     } else if (stringmap.key().find("prepacked", 0) == 0) {
-      // Starts with 'prepacked'. Each prepacked entry may have multiple blobs with the same key
-      //  we output them with the same key
+      // Starts with 'prepacked', each has its own key.
+      // Each prepacked entry may have multiple blobs with the same key
+      // we output them with the same key
       // format = key|offset;length;checksum[|offset;length;checksum]
       // We are ignoring invalid entries (should not be any), and rely
       // on in memory pre-packs regenerated in this case.
@@ -114,17 +116,38 @@ void ExternalDataInfo::SetExternalLocationToProto(const std::filesystem::path& e
   length->set_value(std::to_string(tensor_bytes_size));
 }
 
-// void ExternalDataInfo::AddPrepackedEntriesToProto(
-//     const PrepackedForSerialization::BlobsInderect& prepacked_for_write, ::ONNX_NAMESPACE::TensorProto& proto) {
-//   size_t prepack_count = 0;
-//   std::stringstream os;
-//   for (auto iter : prepacked_for_write) {
-//     const auto& [key, prepacked_weights] = *iter;
-//     os << key << '|';
-//     const size_t blob_num = prepacked_weights.buffers_.size();
-//     for (size_t i = 0; blob_num; ++i) {
-//       //XXX: Need offset calculation
-//       // os << ed_weights.blobs_[i].offset << ';';
-//     }
-//   }
+// Writes the blobs referenced by `prepacked_for_write` to `os`, padding first via AlignAndPad when `align` is set,
+// and advances `external_offset` past every blob written. For each key one external_data entry is added to
+// `proto` with the value key|offset;length;checksum[|offset;length;checksum] (checksum is always "0").
+// Returns `os`; on a failed pad or write it returns early and the stream state reports the failure.
+std::ostream& ExternalDataInfo::AddPrepackedEntriesToProto(
+    const PrepackedForSerialization::BlobsInderect& prepacked_for_write, bool align, int64_t allocation_granularity,
+    std::ostream& os, int64_t& external_offset, ::ONNX_NAMESPACE::TensorProto& proto) {
+  // Numbers the emitted entries so each one gets a distinct "prepacked_<N>" key, independent of its blob count.
+  size_t key_count = 0;
+  for (const auto& iter : prepacked_for_write) {
+    const auto& [key, prepacked_weights] = *iter;
+    std::stringstream prepacked_entry;
+    prepacked_entry << key << "|";
+    for (size_t i = 0, size = prepacked_weights.buffers_.size(); i < size; ++i) {
+      if (align && !AlignAndPad(os, allocation_granularity, external_offset)) {
+        return os;
+      }
+      const auto size_in_bytes = prepacked_weights.buffer_sizes_[i];
+      if (i > 0) {
+        prepacked_entry << "|";
+      }
+      // Checksum is currently not validated
+      prepacked_entry << external_offset << ";" << size_in_bytes << ";0";
+      if (!os.write(reinterpret_cast<const char*>(prepacked_weights.buffers_[i].get()), size_in_bytes)) {
+        return os;
+      }
+      external_offset = SafeInt<int64_t>(external_offset) + size_in_bytes;
+    }
+    auto* prepacked = proto.add_external_data();
+    prepacked->set_key("prepacked_" + std::to_string(key_count++));
+    prepacked->set_value(prepacked_entry.str());
+  }
+  return os;
+}
 }  // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/framework/tensor_external_data_info.h b/onnxruntime/core/framework/tensor_external_data_info.h
index 853c14338a2f8..3c65fd51815d7 100644
--- a/onnxruntime/core/framework/tensor_external_data_info.h
+++ b/onnxruntime/core/framework/tensor_external_data_info.h
@@ -3,6 +3,7 @@
 #pragma once
 
 #include <filesystem>
+#include <ostream>
 #include <string>
 #include <tuple>
 
@@ -39,8 +40,33 @@ class ExternalDataInfo {
                                          size_t tensor_bytes_size,
                                          ::ONNX_NAMESPACE::TensorProto& proto);
 
-  static void AddPrepackedEntriesToProto(const PrepackedForSerialization::BlobsInderect& prepacked_for_write,
-                                         ::ONNX_NAMESPACE::TensorProto& proto);
+  // Pads `stream` with zero bytes until `external_offset` reaches the next multiple of
+  // max(4096, allocation_granularity), then stores the aligned value back into `external_offset`.
+  // Returns `stream` so that callers can test it to detect a failed write.
+  // Padding goes in front of the tensor data because large tensors must start at an aligned offset:
+  // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX
+  // |<---smaller tensor---->|<---padding--->|<------------------large tensor----------------------------->|
+  static std::ostream& AlignAndPad(std::ostream& stream, int64_t allocation_granularity, int64_t& external_offset) {
+    // Align to the larger of the page size or the allocation granularity
+    const int64_t alignment_factor = std::max(static_cast<int64_t>(4096), allocation_granularity);
+    // Round up to the next boundary. Integer division already truncates, so no std::floor is needed;
+    // going through double would lose precision for very large offsets.
+    const int64_t new_external_offset =
+        ((external_offset + alignment_factor - 1) / alignment_factor) * alignment_factor;
+
+    // padding tensor with zeros for alignment
+    for (int64_t index = external_offset; index != new_external_offset; ++index) {
+      stream << '\0';
+    }
+    external_offset = new_external_offset;
+    return stream;
+  }
+
+  static std::ostream& AddPrepackedEntriesToProto(const PrepackedForSerialization::BlobsInderect& prepacked_for_write,
+                                                  bool align, int64_t allocation_granularity,
+                                                  std::ostream& os,
+                                                  int64_t& external_offset,
+                                                  ::ONNX_NAMESPACE::TensorProto& proto);
 
   using PrepackedInfo = std::tuple<OFFSET_TYPE, size_t, std::string>;
   using PrepackedInfos = std::unordered_map<std::string, std::vector<PrepackedInfo>>;
diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc
index 6d1f9d631d0aa..0ce6e4360db5f 100644
--- a/onnxruntime/core/framework/tensorprotoutils.cc
+++ b/onnxruntime/core/framework/tensorprotoutils.cc
@@ -1057,6 +1057,8 @@ Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& mo
     if (prepacked_info != nullptr && !prepacked_infos->empty()) {
       for (const auto& [key, blobs] : *prepacked_infos) {
         PrePackedWeights prepacked_weights;
+        prepacked_weights.buffers_.reserve(blobs.size());
+        prepacked_weights.buffer_sizes_.reserve(blobs.size());
         for (const auto& blob : blobs) {
           const auto blob_offset = std::get<0>(blob);
           const auto blob_length = std::get<1>(blob);
@@ -1074,7 +1076,7 @@ Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& mo
           prepacked_weights.buffer_sizes_.push_back(blob_length);
         }
         if (!blobs.empty()) {
-          prepacked_info->Insert(key, std::move(prepacked_weights));
+          prepacked_info->InsertFromDisk(key, std::move(prepacked_weights));
         }
       }
     }
diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc
index b7353ca3875bf..a54c0b421b8f8 100644
--- a/onnxruntime/core/graph/graph.cc
+++ b/onnxruntime/core/graph/graph.cc
@@ -4086,59 +4086,51 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProto() const {
   return result;
 }
 
-// Create a recursive function that does bottom up with subgraphs
-ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitiallizersImpl(
+// A recursive function that does bottom up with subgraphs
+Status Graph::ToGraphProtoWithExternalInitiallizersImpl(
     const std::filesystem::path& model_path,
     const std::filesystem::path& external_file_path,
+    const std::filesystem::path& modified_external_file_path,
     const ModelSavingOptions& model_saving_options,
     ONNX_NAMESPACE::GraphProto& output_graph_proto,
     std::ostream& external_stream,
     int64_t& external_offset) const {
-  // update external_offset for alignment
-  // need to do padding before write actual tensor data as we do offset alignment at the begin of
-  // large tensors (offset need to be page aligned and allocation granularity aligned) like below:
-  // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX
-  // |<---small tensor---->|<---padding--->|<------------------large tensor----------------------------->|
-  auto compute_and_pad = [&external_stream](int64_t allocation_granularity, int64_t& external_offset) {
-    // Align to the larger of the page size or the allocation granularity
-    int64_t alignment_factor = std::max(static_cast<int64_t>(4096), allocation_granularity);
-    // Align to the next page or alloc granularity boundary
-    int64_t new_external_offset = static_cast<int64_t>(
-                                      std::floor((external_offset + alignment_factor - 1) / alignment_factor)) *
-                                  alignment_factor;
-
-    // padding tensor with zeros for alignment
-    for (int64_t index = external_offset; index != new_external_offset; ++index) {
-      external_stream << '\0';
-    }
-    external_offset = new_external_offset;
-  };
-
   // Process subgraphs
   for (const auto& node : Nodes()) {
     if (node.ContainsSubgraph()) {
       // Let find this node in the output_graph_proto
-      auto hit = std::find_if(output_graph_proto.node().begin(),
-                              output_graph_proto.node().end(),
+      auto hit = std::find_if(output_graph_proto.mutable_node()->begin(),
+                              output_graph_proto.mutable_node()->end(),
                               [&node](const ONNX_NAMESPACE::NodeProto& proto) {
                                 return proto.name() == node.Name();
                               });
-      ORT_ENFORCE(hit != output_graph_proto.node().end(), "Node ", node.Name(),
-                  " not found in output_graph_proto");
+      ORT_RETURN_IF_NOT(hit != output_graph_proto.mutable_node()->end(), "Node ", node.Name(),
+                        " not found in output_graph_proto");
       auto& result_node = *hit;
       for (const auto& [name, subgraph] : node.GetAttributeNameToSubgraphMap()) {
         // Lets find this subgraph in the result_node
-        auto sub_hit = std::find_if(result_node.attribute().begin(),
-                                    result_node.attribute().end(),
+        auto sub_hit = std::find_if(result_node.mutable_attribute()->begin(),
+                                    result_node.mutable_attribute()->end(),
                                     [&name](const ONNX_NAMESPACE::AttributeProto& proto) {
                                       return proto.name() == name;
                                     });
-        ORT_ENFORCE(sub_hit != result_node.attribute().end(), "Subgraph ", name,
-                    " not found in node ", node.Name());
+        ORT_RETURN_IF_NOT(sub_hit != result_node.mutable_attribute()->end() && utils::HasGraph(*sub_hit),
+                          "Subgraph ", name, " not found in node ", node.Name());
+        auto& result_subgraph = *sub_hit->mutable_g();
+        ORT_RETURN_IF_ERROR(subgraph->ToGraphProtoWithExternalInitiallizersImpl(
+            model_path, external_file_path,
+            modified_external_file_path, model_saving_options,
+            result_subgraph, external_stream, external_offset));
       }
     }
   }
 
+  const PrepackedForSerialization::Subgraph* prepacked_parent_graph = nullptr;
+  if (model_saving_options.prepacked_for_save != nullptr) {
+    // Is there any pre-packed weights for this subgraph?
+    prepacked_parent_graph = model_saving_options.prepacked_for_save->FindPrepackedGraph(*this);
+  }
+
   // Add the initializers to the result graph.
   for (const auto& initializer : graph_proto_->initializer()) {
 #if !defined(DISABLE_SPARSE_TENSORS)
@@ -4146,14 +4138,14 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitiallizersImpl(
       // Sparse tensors are added to the ONNX file.
       auto& sparse_initializer = *output_graph_proto.add_sparse_initializer();
       auto status = utils::DenseTensorToSparseTensorProto(initializer, model_path, sparse_initializer);
-      ORT_ENFORCE(status.IsOK(), "Failed to convert dense initializer to sparse");
+      ORT_RETURN_IF_NOT(status.IsOK(), "Failed to convert dense initializer to sparse");
     } else {
 #endif
       // Dense tensors larger than the threshold are added to the external file.
       TensorProto* output_proto = output_graph_proto.add_initializer();
 
       std::vector<uint8_t> raw_data;
-      ORT_THROW_IF_ERROR(utils::UnpackInitializerData(initializer, model_path, raw_data));
+      ORT_RETURN_IF_ERROR(utils::UnpackInitializerData(initializer, model_path, raw_data));
       size_t tensor_bytes_size = raw_data.size();
       if (tensor_bytes_size < model_saving_options.initializer_size_threshold) {
         *output_proto = initializer;
@@ -4164,15 +4156,16 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitiallizersImpl(
       // need to do padding before write actual tensor data as we do offset alignment at the begin of
       // large tensors (offset need to be page aligned and allocation granularity aligned) like below:
       // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX
-      // |<---small tensor---->|<---padding--->|<------------------large tensor----------------------------->|
+      // |<---smaller tensor---->|<---padding--->|<------------------large tensor----------------------------->|
       if (model_saving_options.align_offset && static_cast<int64_t>(tensor_bytes_size) >
                                                    model_saving_options.align_threshold) {
-        compute_and_pad(model_saving_options.allocation_granularity, external_offset);
+        ORT_RETURN_IF_NOT(ExternalDataInfo::AlignAndPad(external_stream, model_saving_options.allocation_granularity,
+                                                        external_offset),
+                          "Failed writing external data to: ", modified_external_file_path);
       }
 
-      if (!external_stream.write(reinterpret_cast<const char*>(raw_data.data()), tensor_bytes_size)) {
-        ORT_THROW("Failed to write external initializers to file: ", modified_external_file_path);
-      }
+      ORT_RETURN_IF_NOT(external_stream.write(reinterpret_cast<const char*>(raw_data.data()), tensor_bytes_size),
+                        "Failed to write external initializers to file: ", modified_external_file_path);
 
       ExternalDataInfo::SetExternalLocationToProto(external_file_path, external_offset,
                                                    tensor_bytes_size, *output_proto);
@@ -4186,15 +4179,21 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitiallizersImpl(
 
       external_offset += tensor_bytes_size;
 
-      const PrepackedForSerialization::Subgraph* prepacked_subgraph = nullptr;
-      if (model_saving_options.prepacked_for_save != nullptr) {
-        prepacked_subgraph = *model_saving_options.prepacked_for_save->FindOrCreateSubgraph(*this);
+      if (prepacked_parent_graph != nullptr) {
+        const auto* iters_to_blobs = prepacked_parent_graph->GetBlobsForWeight(initializer.name());
+        if (iters_to_blobs != nullptr && !iters_to_blobs->empty()) {
+          ORT_RETURN_IF_NOT(ExternalDataInfo::AddPrepackedEntriesToProto(
+              *iters_to_blobs, model_saving_options.align_offset,
+              model_saving_options.allocation_granularity,
+              external_stream, external_offset, *output_proto));
+        }
       }
 
 #if !defined(DISABLE_SPARSE_TENSORS)
     }
 #endif
   }
+  return Status::OK();
 }
 
 ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(
@@ -4211,84 +4210,12 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(
 
   // Create the external file.
   std::ofstream external_stream(modified_external_file_path, std::ofstream::out | std::ofstream::binary);
-  ORT_ENFORCE(external_stream.is_open());
+  ORT_ENFORCE(external_stream.is_open(), "Failed to open for writing:", modified_external_file_path);
   int64_t external_offset = 0;
 
-  // update external_offset for alignment
-  // need to do padding before write actual tensor data as we do offset alignment at the begin of
-  // large tensors (offset need to be page aligned and allocation granularity aligned) like below:
-  // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX
-  // |<---small tensor---->|<---padding--->|<------------------large tensor----------------------------->|
-  auto compute_and_pad = [&external_stream](int64_t allocation_granularity, int64_t& external_offset) {
-    // Align to the larger of the page size or the allocation granularity
-    int64_t alignment_factor = std::max(static_cast<int64_t>(4096), allocation_granularity);
-    // Align to the next page or alloc granularity boundary
-    int64_t new_external_offset = static_cast<int64_t>(
-                                      std::floor((external_offset + alignment_factor - 1) / alignment_factor)) *
-                                  alignment_factor;
-
-    // padding tensor with zeros for alignment
-    for (int64_t index = external_offset; index != new_external_offset; ++index) {
-      external_stream << '\0';
-    }
-    external_offset = new_external_offset;
-  };
-
-  // Add the initializers to the result graph.
-#if !defined(DISABLE_SPARSE_TENSORS)
-  const auto sparse_end = sparse_tensor_names_.end();
-#endif
-
-  for (const auto& initializer : graph_proto_->initializer()) {
-#if !defined(DISABLE_SPARSE_TENSORS)
-    if (sparse_end != sparse_tensor_names_.find(initializer.name())) {
-      // Sparse tensors are added to the ONNX file.
-      auto& sparse_initializer = *result.add_sparse_initializer();
-      auto status = utils::DenseTensorToSparseTensorProto(initializer, model_path, sparse_initializer);
-      ORT_ENFORCE(status.IsOK(), "Failed to convert dense initializer to sparse");
-    } else {
-#endif
-      // Dense tensors larger than the threshold are added to the external file.
-      TensorProto* output_proto = result.add_initializer();
-
-      std::vector<uint8_t> raw_data;
-      ORT_THROW_IF_ERROR(utils::UnpackInitializerData(initializer, model_path, raw_data));
-      size_t tensor_bytes_size = raw_data.size();
-      if (tensor_bytes_size < model_saving_options.initializer_size_threshold) {
-        *output_proto = initializer;
-        continue;
-      }
-
-      // update external_offset for alignment
-      // need to do padding before write actual tensor data as we do offset alignment at the begin of
-      // large tensors (offset need to be page aligned and allocation granularity aligned) like below:
-      // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX
-      // |<---small tensor---->|<---padding--->|<------------------large tensor----------------------------->|
-      if (model_saving_options.align_offset && static_cast<int64_t>(tensor_bytes_size) >
-                                                   model_saving_options.align_threshold) {
-        compute_and_pad(model_saving_options.allocation_granularity, external_offset);
-      }
-
-      if (!external_stream.write(reinterpret_cast<const char*>(raw_data.data()), tensor_bytes_size)) {
-        ORT_THROW("Failed to write external initializers to file: ", modified_external_file_path);
-      }
-
-      ExternalDataInfo::SetExternalLocationToProto(external_file_path, external_offset,
-                                                   tensor_bytes_size, *output_proto);
-
-      output_proto->set_name(initializer.name());
-      output_proto->set_data_type(initializer.data_type());
-      for (int i = 0; i != initializer.dims_size(); ++i) {
-        output_proto->add_dims(initializer.dims(i));
-      }
-      output_proto->set_doc_string(initializer.doc_string());
-
-      external_offset += tensor_bytes_size;
-
-#if !defined(DISABLE_SPARSE_TENSORS)
-    }
-#endif
-  }
+  ORT_THROW_IF_ERROR(ToGraphProtoWithExternalInitiallizersImpl(model_path, external_file_path,
+                                                               modified_external_file_path, model_saving_options,
+                                                               result, external_stream, external_offset));
 
   if (!external_stream.flush()) {
     ORT_THROW("Failed to flush file with external initializers: ", modified_external_file_path);