diff --git a/include/onnxruntime/core/framework/buffer_deleter.h b/include/onnxruntime/core/framework/buffer_deleter.h
index 961eb443ee1c7..684cab4a722bc 100644
--- a/include/onnxruntime/core/framework/buffer_deleter.h
+++ b/include/onnxruntime/core/framework/buffer_deleter.h
@@ -5,6 +5,8 @@
 
 #include "core/framework/allocator.h"
 
+#include <functional>
+
 namespace onnxruntime {
 
 // TODO: Do we need this class or is IAllocator::MakeUniquePtr sufficient/better
@@ -31,6 +33,6 @@ class BufferDeleter {
   AllocatorPtr alloc_{nullptr};
 };
 
-using BufferUniquePtr = std::unique_ptr<void, BufferDeleter>;
+using BufferUniquePtr = std::unique_ptr<void, std::function<void(void*)>>;
 using BufferNakedPtr = void*;
 }  // namespace onnxruntime
diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h
index eb9581e8018d1..4764c047e21db 100644
--- a/include/onnxruntime/core/graph/graph.h
+++ b/include/onnxruntime/core/graph/graph.h
@@ -19,10 +19,10 @@
 #include "core/common/common.h"
 #include "core/common/path_string.h"
 #include "core/common/const_pointer_container.h"
+#include "core/common/inlined_containers_fwd.h"
 #if !defined(ORT_MINIMAL_BUILD)
 #include "core/common/inlined_containers.h"
 #endif
-#include "core/common/inlined_containers_fwd.h"
 #include "core/common/span_utils.h"
 #include "core/common/status.h"
 #include "core/common/logging/logging.h"
@@ -41,6 +41,7 @@ namespace onnxruntime {
 class Graph;
 struct IndexedSubGraph;
 class Model;
+struct ModelSavingOptions;
 class OpSignature;
 
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
@@ -1153,29 +1154,6 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   const ONNX_NAMESPACE::GraphProto& ToGraphProto();
   ONNX_NAMESPACE::GraphProto ToGraphProto() const;
 
-  // Options to align external initializer offset.
-  // For models running on CPU, ORT will try to use mmap to load external initializers.
-  // To use mmap, external initializer need to be offset aligned.
-  // ORT saves external initializers into signle data file, each initializer is accessed with
-  // offset(start position of initializer) and length(byte length of initializer) of the data file.
-  // To use mmap, each offset need to be aligned which means offset need to divisible by
-  // allocation granularity(64KB for windows and 4K for other OSes).
-  // With align_offset to true, ORT will align offset for large initializer when
-  // save ONNX model with external data file.
-  struct OffsetAlignmentInfo {
-    // Offset will always be page aligned and allocation granularity aligned for mmap support.
-    // This is done by padding previous tensor data with zeros keeping same length.
-    bool align_offset = false;
-    // Alignment threshold for size of data.
-    // Having a low threshold will waste file space for small initializers.
-    // Only when tensor's data size is > the page_align_threshold it will be force aligned.
-    // Default to 1MB.
-    int64_t align_threshold = 1048576;
-    // The allocation Granularity for mmap() support.
-    // Typically 64KB for Windows & 4KB for other OSes. Default to 64KB.
-    int64_t allocation_granularity = 65536;
-  };
-
   /** Gets the GraphProto representation of this Graph
   @param external_file_path File path of the binary file to use for initializers.
   @param model_file_path path of the model file.
@@ -1186,15 +1164,7 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   */
   ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
                                                                   const std::filesystem::path& model_file_path,
-                                                                  size_t initializer_size_threshold,
-                                                                  const OffsetAlignmentInfo& align_info) const;
-
-  ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
-                                                                  const std::filesystem::path& model_file_path,
-                                                                  size_t initializer_size_threshold) const {
-    OffsetAlignmentInfo default_options;
-    return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options);
-  }
+                                                                  const ModelSavingOptions& model_saving_options) const;
 
   /** Gets the ISchemaRegistry instances being used with this Graph. */
   IOnnxRuntimeOpSchemaCollectionPtr GetSchemaRegistry() const;
@@ -1519,6 +1489,28 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   Status AddConstantProtoAsInitializer(const ONNX_NAMESPACE::NodeProto& constant_node_proto,
                                        std::optional<std::string_view> new_name);
 
+  /// <summary>
+  /// A map that is used to keep track of pre-packed blobs to be serialized
+  /// The implementation adds pre-packed external data references to the TensorProto
+  /// that contains the initializer data. However, it may be an outerscope initializer.
+  /// Thus we need to keep track of the pre-packed blobs that are not serialized in this
+  /// graph, so the parent can make sure it is being serialized.
+  ///
+  /// The below map has <weight_name, std::vector<blob_key_name>>. This contains
+  /// the entries that are not serialized in this graph, and the parent must check in them
+  /// </summary>
+  using WeightToPrePacksMap = NodeHashMap<std::string, InlinedHashSet<std::string>>;
+
+  Status ToGraphProtoWithExternalInitiallizersImpl(
+      const std::filesystem::path& model_path,
+      const std::filesystem::path& external_file_path,
+      const std::filesystem::path& modified_external_file_path,
+      const ModelSavingOptions& model_saving_options,
+      WeightToPrePacksMap& unprocessed_prepacks,
+      ONNX_NAMESPACE::GraphProto& graph_proto,
+      std::ostream& external_stream,
+      int64_t& external_offset) const;
+
 #endif
 
   Version IrVersion() const noexcept {
diff --git a/include/onnxruntime/core/graph/model_saving_options.h b/include/onnxruntime/core/graph/model_saving_options.h
new file mode 100644
index 0000000000000..2df67e625a55f
--- /dev/null
+++ b/include/onnxruntime/core/graph/model_saving_options.h
@@ -0,0 +1,44 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+namespace onnxruntime {
+
+class PrepackedForSerialization;
+
+// These options that affect how the model initializers are saved.
+// This includes options to align external initializer offset.
+// For models running on CPU, ORT will try to use mmap to load external
+// initializers. To use mmap, external initializer need to be offset aligned.
+// ORT saves external initializers into signle data file, each initializer is
+// accessed with offset(start position of initializer) and length(byte length of
+// initializer) of the data file. To use mmap, each offset need to be aligned
+// which means offset need to divisible by allocation granularity(64KB for
+// windows and 4K for other OSes). With align_offset to true, ORT will align
+// offset for large initializer when save ONNX model with external data file.
+struct ModelSavingOptions {
+  explicit ModelSavingOptions(size_t size_threshold)
+      : initializer_size_threshold(size_threshold) {}
+
+  // Mimimal initializer size in bytes to be externalized on disk
+  size_t initializer_size_threshold;
+  // Offset will always be page aligned and allocation granularity aligned for
+  // mmap support. This is done by padding previous tensor data with zeros
+  // keeping same length.
+  bool align_offset = false;
+  // Alignment threshold for size of data.
+  // Having a low threshold will waste file space for small initializers.
+  // Only when tensor's data size is > the page_align_threshold it will be force
+  // aligned. Default to 1MB.
+  int64_t align_threshold = 1048576;
+  // The allocation Granularity for mmap() support.
+  // Typically 64KB for Windows & 4KB for other OSes. Default to 64KB.
+  int64_t allocation_granularity = 65536;
+  // Optional pointer to a container of pre-packed initializers to be
+  // embedded into the external initializers, so they can also be loaded
+  // from disk.
+  const PrepackedForSerialization* prepacked_for_save = nullptr;
+};
+
+}  // namespace onnxruntime
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index 6a01602e634f8..cec3c9ce40b9a 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -250,6 +250,15 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil
 static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
     "session.optimized_model_external_initializers_min_size_in_bytes";
 
+// Use this config when save pre-packed constant initializers to an external data file.
+// This allows to minimize ONNX model file size and memory map pre-packed initializers on
+// model load.
+// - "0": Default is not save pre-packed initializers to a data file.
+// - "1": Save pre-packed constant initializers to an external data file.
+// Sample usage: sess_options.add_session_config_entry(kOrtSessionOptionsSavePrePackedConstantInitializers,  "1")
+static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers =
+    "session.save_external_prepacked_constant_initializers";
+
 // Enable EP context feature to dump the partitioned graph which includes the EP context into Onnx file.
 // The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead.
 // "0": disable. (default)
diff --git a/onnxruntime/core/framework/prepacked_weights.h b/onnxruntime/core/framework/prepacked_weights.h
index fbf99b81937ee..9a36e9bc48f05 100644
--- a/onnxruntime/core/framework/prepacked_weights.h
+++ b/onnxruntime/core/framework/prepacked_weights.h
@@ -6,6 +6,7 @@
 #include <vector>
 
 #include "core/common/basic_types.h"
+#include "core/common/inlined_containers_fwd.h"
 #include "core/framework/buffer_deleter.h"
 #include "core/framework/tensor_shape.h"
 
@@ -16,11 +17,14 @@ struct PrePackedWeights final {
   // Hence we hold them in container. It is upto the developer implementing each PrePack()
   // method to define what gets stored in which position of the container.
 
-  std::vector<IAllocatorUniquePtr<void>> buffers_;  // cache pre-packed buffers associated with the kernel
-  std::vector<size_t> buffer_sizes_;                // cache sizes of pre-packed buffers (in bytes)
+  InlinedVector<BufferUniquePtr> buffers_;  // cache pre-packed buffers associated with the kernel
+  InlinedVector<size_t> buffer_sizes_;      // cache sizes of pre-packed buffers (in bytes)
 
   // Produces a hash of the buffers stored in the given instance of this class
   HashValue GetHash() const;
+
+  // The function creates a copy with non-owning BufferUniquePtrs.
+  PrePackedWeights CreateReferringCopy() const;
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/prepacked_weights_container.cc b/onnxruntime/core/framework/prepacked_weights_container.cc
index b6d44dd248bdd..ffb545dd972a6 100644
--- a/onnxruntime/core/framework/prepacked_weights_container.cc
+++ b/onnxruntime/core/framework/prepacked_weights_container.cc
@@ -3,9 +3,21 @@
 
 #include "core/framework/prepacked_weights_container.h"
 #include "core/framework/allocator_utils.h"
+#include "core/graph/graph.h"
 
 namespace onnxruntime {
 
+PrePackedWeights PrePackedWeights::CreateReferringCopy() const {
+  PrePackedWeights copy;
+  for (const auto& prepacked_buffer : buffers_) {
+    // BufferDeleter is nullptr because we do not own the data in this case
+    copy.buffers_.emplace_back(prepacked_buffer.get(), BufferDeleter(nullptr));
+  }
+
+  copy.buffer_sizes_ = buffer_sizes_;
+  return copy;
+}
+
 AllocatorPtr PrepackedWeightsContainer::GetOrCreateAllocator(const std::string& device_name) {
   auto iter = allocators_.find(device_name);
 
@@ -49,4 +61,87 @@ size_t PrepackedWeightsContainer::GetNumberOfElements() const {
   return prepacked_weights_map_.size();
 }
 
+PrepackedForSerialization::PrepackedForSerialization()
+    : main_graph_(nullptr, key_to_blobs_, false) {
+}
+
+PrepackedForSerialization::~PrepackedForSerialization() = default;
+
+void PrepackedForSerialization::Subgraph::InsertFromDisk(const std::string& key, PrePackedWeights&& packed_weight) {
+  // We may have duplicate entries mapped from disk if the same weight is pre-packed from subgraphs and
+  // up the tree by the same kernel with the same result. The map prevents this from happening.
+  key_to_blobs_.emplace(key, std::move(packed_weight));
+}
+
+void PrepackedForSerialization::Subgraph::WritePacked(const std::string& weight_name, const std::string& key,
+                                                      PrePackedWeights&& packed_weight) {
+  auto hit = key_to_blobs_.find(key);
+  if (hit == key_to_blobs_.end()) {
+    // new key
+    key_to_blobs_.emplace(key, std::move(packed_weight));
+    if (save_mode_on_) {
+      sorted_by_weight_for_writing_[weight_name].insert(key);
+    }
+    return;
+  }
+
+  // Key existed, but may or may not have a reference in this subgraph
+  if (save_mode_on_) {
+    auto& list = sorted_by_weight_for_writing_[weight_name];
+    list.insert(key);
+  }
+  hit->second = std::move(packed_weight);
+}
+
+const PrePackedWeights* PrepackedForSerialization::Subgraph::GetPrepackedWeights(const std::string& key) const {
+  auto it = key_to_blobs_.find(key);
+  if (it == key_to_blobs_.end()) {
+    return nullptr;
+  }
+  return &it->second;
+}
+
+std::optional<PrePackedWeights> PrepackedForSerialization::Subgraph::ReplaceWithReferenceIfSaving(
+    const std::string& weight_name,
+    const std::string& key,
+    const PrePackedWeights& refer_if_absent) {
+  auto it = key_to_blobs_.find(key);
+  if (it == key_to_blobs_.end()) {
+    if (save_mode_on_) {
+      key_to_blobs_.emplace(key, refer_if_absent.CreateReferringCopy());
+      sorted_by_weight_for_writing_[weight_name].insert(key);
+    }
+    return std::nullopt;
+  }
+
+  PrePackedWeights result = std::move(it->second);
+  if (save_mode_on_) {
+    it->second = result.CreateReferringCopy();
+    auto& list = sorted_by_weight_for_writing_[weight_name];
+    list.insert(key);
+  } else {
+    key_to_blobs_.erase(it);
+  }
+  return result;
+}
+
+PrepackedForSerialization::Subgraph& PrepackedForSerialization::FindOrCreatePrepackedGraph(const Graph& graph) {
+  if (graph.ParentGraph() == nullptr) {
+    return main_graph_;
+  }
+  auto& parent = FindOrCreatePrepackedGraph(*graph.ParentGraph());
+  return parent.GetOrCreateSubgraph(graph);
+}
+
+const PrepackedForSerialization::Subgraph* PrepackedForSerialization::FindPrepackedGraph(const Graph& graph) const {
+  if (graph.ParentGraph() == nullptr) {
+    return &main_graph_;
+  }
+  auto* parent = FindPrepackedGraph(*graph.ParentGraph());
+  if (parent != nullptr) {
+    parent = parent->GetSubgraph(graph);
+  }
+  return parent;
+}
+
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/prepacked_weights_container.h b/onnxruntime/core/framework/prepacked_weights_container.h
index 37fc01c05f2ae..a0880239bba5f 100644
--- a/onnxruntime/core/framework/prepacked_weights_container.h
+++ b/onnxruntime/core/framework/prepacked_weights_container.h
@@ -3,19 +3,26 @@
 
 #pragma once
 
-#include <unordered_map>
-#include <unordered_set>
-#include <string>
-#include <cstdint>
-
-#include "core/framework/buffer_deleter.h"
-
+#include "core/common/common.h"
 #include "core/framework/allocator.h"
-#include <mutex>
 #include "prepacked_weights.h"
 
+#include <cstdint>
+#include <mutex>
+#include <optional>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+
 namespace onnxruntime {
 
+#ifndef SHARED_PROVIDER
+class Graph;
+#else
+struct Graph;
+#endif
+
 class PrepackedWeightsContainer final {
  public:
   PrepackedWeightsContainer() {
@@ -66,4 +73,162 @@ class PrepackedWeightsContainer final {
   std::unordered_map<std::string, PrePackedWeights> prepacked_weights_map_;
 };
 
+/// <summary>
+/// This class has a dual purpose.
+/// When saving to disk is ON (IsOverWriteForSave() true)
+/// it provides a storage container for PrePackedWeights instances. The pre-packed
+/// data is collected using PrepackConstaitInitializers(). In this case newly pre-pack
+/// data is used for writing to disk, unless old data matches.
+///
+/// If saving is OFF, it is used to contain the weights memory mapped from disk.
+/// Those weights are then moved to the shared container if weight sharing is enabled.
+/// And also the interested kernels.
+/// </summary>
+class PrepackedForSerialization final {
+ public:
+  explicit PrepackedForSerialization();
+  ~PrepackedForSerialization();
+
+  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PrepackedForSerialization);
+
+  using KeyToBlobMap = std::unordered_map<std::string, PrePackedWeights>;
+
+  // Maps weight name to iterators in key_to_blobs_. It associates a weight name with its pre-packs.
+  // Normally, a single weight produces a single PrePackedWeights. But it is possible that a weight
+  // is pre-packed by different kernels.
+  using KeysPerWeight = std::unordered_set<std::string>;  // blob keys
+  using WeightToPrePacksMap = std::unordered_map<std::string, KeysPerWeight>;
+
+  class Subgraph {
+   public:
+    Subgraph(Subgraph* par, KeyToBlobMap& key_blobs, bool overwrite_for_save)
+        : save_mode_on_(overwrite_for_save), parent_(par), key_to_blobs_(key_blobs) {
+    }
+
+    ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Subgraph);
+
+    const Subgraph* Parent() const noexcept {
+      return parent_;
+    }
+
+    Subgraph* Parent() noexcept {
+      return parent_;
+    }
+
+    Subgraph& GetOrCreateSubgraph(const Graph& graph) {
+      auto result = subgraph_prepacks_.emplace(&graph, nullptr);
+      if (result.second) {
+        result.first->second = std::make_unique<Subgraph>(this, key_to_blobs_, save_mode_on_);
+      }
+      return *result.first->second;
+    }
+
+    const Subgraph* GetSubgraph(const Graph& graph) const {
+      auto it = subgraph_prepacks_.find(&graph);
+      return it == subgraph_prepacks_.end() ? nullptr : it->second.get();
+    }
+
+    void InsertFromDisk(const std::string& key, PrePackedWeights&& packed_weight);
+
+    void WritePacked(const std::string& weight_name, const std::string& key,
+                     PrePackedWeights&& packed_weight);
+
+    const PrePackedWeights* GetPrepackedWeights(const std::string& key) const;
+
+    // The function would add or replace existing entry with references to it.
+    // If the entry is present, it would replace it with references to the existing entry.
+    // If the entry is not present, it would add reference to refer_if_absent
+    // If present it would return the existing entry otherwise std::nullopt
+    std::optional<PrePackedWeights> ReplaceWithReferenceIfSaving(const std::string& weight_name,
+                                                                 const std::string& key,
+                                                                 const PrePackedWeights& refer_if_absent);
+
+    bool IsSaveModeOn() const noexcept {
+      return save_mode_on_;
+    }
+
+    void SetSaveMode(bool value) noexcept {
+      save_mode_on_ = value;
+    }
+
+    // Returns iterators to key->blob pair for writing
+    const KeysPerWeight* GetBlobsForWeight(const std::string& weight_name) const {
+      auto hit = sorted_by_weight_for_writing_.find(weight_name);
+      if (hit != sorted_by_weight_for_writing_.end()) {
+        return &hit->second;
+      }
+      return nullptr;
+    }
+
+    size_t GetSubgraphNum() const noexcept {
+      return subgraph_prepacks_.size();
+    }
+
+    size_t GetNumberOfWeightsForWriting() const noexcept {
+      return sorted_by_weight_for_writing_.size();
+    }
+
+    size_t GetNumberOfKeyedBlobsForWriting() const noexcept {
+      size_t result = 0;
+      for (const auto& [_, keys] : sorted_by_weight_for_writing_) {
+        result += keys.size();
+      }
+      return result;
+    }
+
+    const WeightToPrePacksMap& GetSortedByWeightForWriting() const noexcept {
+      return sorted_by_weight_for_writing_;
+    }
+
+    // This template declaration is intended to be instantiated
+    // and used by test code to inspect private members. Otherwise,
+    // it has no body and is not intended to be used by other code.
+    template <class T>
+    void TestHarness(T&) const;
+
+   private:
+    bool save_mode_on_ = false;
+    Subgraph* parent_ = nullptr;
+    KeyToBlobMap& key_to_blobs_;
+    WeightToPrePacksMap sorted_by_weight_for_writing_;
+    // Map Graph ptr to subgraphs
+    std::unordered_map<const Graph*, std::unique_ptr<Subgraph>> subgraph_prepacks_;
+  };
+
+  const PrePackedWeights* GetPrepackedForKey(const std::string& key) const {
+    auto it = key_to_blobs_.find(key);
+    return it == key_to_blobs_.end() ? nullptr : &it->second;
+  }
+
+  const Subgraph& MainGraph() const noexcept {
+    return main_graph_;
+  }
+
+  Subgraph& MainGraph() noexcept {
+    return main_graph_;
+  }
+
+  size_t GetNumberOfKeyedBlobs() const noexcept {
+    return key_to_blobs_.size();
+  }
+
+  void SetSaveMode(bool value) noexcept {
+    main_graph_.SetSaveMode(value);
+  }
+
+  bool IsSaveModeOn() const noexcept {
+    return main_graph_.IsSaveModeOn();
+  }
+
+  Subgraph& FindOrCreatePrepackedGraph(const Graph& graph);
+
+  const Subgraph* FindPrepackedGraph(const Graph& graph) const;
+
+ private:
+  // Map of key to pre-packed blobs.This is common for all subgraphs
+  // The key is : op_type + "+" + hash_of_prepacked_buffers_in_the_PrepackedWeights_instance.
+  // as defined above. We store keys for all scopes (main graph and subgraphs)
+  KeyToBlobMap key_to_blobs_;
+  Subgraph main_graph_;
+};
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc
index 0ac2271ba09f1..5edea481ba7f0 100644
--- a/onnxruntime/core/framework/session_state.cc
+++ b/onnxruntime/core/framework/session_state.cc
@@ -13,6 +13,7 @@
 #include "core/framework/node_index_info.h"
 #include "core/framework/op_kernel.h"
 #include "core/framework/ort_value_pattern_planner.h"
+#include "core/framework/prepacked_weights_container.h"
 #include "core/framework/session_state_utils.h"
 #include "core/framework/utils.h"
 #include "core/providers/cpu/controlflow/utils.h"
@@ -396,9 +397,12 @@ static std::string GenerateKeyForPrepackedWeightsMap(const std::string& op_type,
   return ss_1.str();
 }
 
-Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap<std::string, size_t>& constant_initializers_use_count,
-                                                       const std::unordered_map<std::string, const OrtValue*>& initializers_to_share_map) {
-  auto prepacked_constant_weights = [this, &constant_initializers_use_count, &initializers_to_share_map](
+Status SessionState::PrepackConstantInitializedTensors(
+    PrepackedForSerialization::Subgraph& prepacked_subgraph,
+    InlinedHashMap<std::string, size_t>& constant_initializers_use_count,
+    const std::unordered_map<std::string, const OrtValue*>& initializers_to_share_map) {
+  auto prepacked_constant_weights = [this, &prepacked_subgraph,
+                                     &constant_initializers_use_count, &initializers_to_share_map](
                                         bool should_cache_prepacked_weights_for_shared_initializers) -> Status {
     for (auto& node : GetGraphViewer().Nodes()) {
       auto kernel = GetMutableKernel(node.Index());
@@ -423,7 +427,8 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap<std::strin
 
                 // Caching pre-packed weights is limited to shared initializers associated with the CPU EP for now
                 if (is_shared_initializer && should_cache_prepacked_weights_for_shared_initializers &&
-                    node.GetExecutionProviderType() == kCpuExecutionProvider) {  // caching of pre-packed weights' turned ON
+                    node.GetExecutionProviderType() == kCpuExecutionProvider) {
+                  // caching of pre-packed weights' turned ON
 
                   AllocatorPtr allocator_for_caching = prepacked_weights_container_->GetOrCreateAllocator(CPU);
                   ORT_ENFORCE(allocator_for_caching.get() != nullptr);
@@ -431,16 +436,19 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap<std::strin
                   PrePackedWeights weights_to_be_filled_in;
                   // The reason we invoke PrePack() before looking into the container for any pre-packed weight
                   // cached by another instance of the same op_type (for the same constant initializer) is because
-                  // to truly know if we can use a cached pre-packed weight, we would have to compare the cached pre-packed
-                  // weight with the pre-packed weight generated by this instance of the same op_type because other static
-                  // properties of the node like node attributes could play a role in the pre-packed weights' contents.
+                  // to truly know if we can use a cached pre-packed weight, we would have to compare the cached
+                  // pre-packed  weight with the pre-packed weight generated by this instance of the same op_type
+                  // because other static properties of the node like node attributes could play a role in the
+                  // pre-packed weights' contents.
                   ORT_RETURN_IF_ERROR(kernel->PrePack(const_initialized_tensor, input_idx, allocator_for_caching,
                                                       is_packed,
                                                       &weights_to_be_filled_in));
 
                   if (is_packed) {
-                    // BUG CHECK: Ensure that the kernel has filled in the pre-packed weight to be cached if the weight was pre-packed
-                    ORT_ENFORCE(weights_to_be_filled_in.buffers_.size() > 0, "The kernel corresponding to the node ", node.Name(),
+                    // BUG CHECK: Ensure that the kernel has filled in the pre-packed weight
+                    // to be cached if the weight was pre-packed
+                    ORT_ENFORCE(weights_to_be_filled_in.buffers_.size() > 0,
+                                "The kernel corresponding to the node ", node.Name(),
                                 " doesn't have an implementation that can cache computed pre-packed weights");
 
                     const auto& op_type = node.OpType();
@@ -452,40 +460,117 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap<std::strin
                     // The key for the pre-packed weights container lookup is the op_type + hash of the prepacked-weight
                     // that we just got by invoking PrePack() on this kernel.
 
-                    const std::string& prepacked_weights_container_key = GenerateKeyForPrepackedWeightsMap(op_type,
-                                                                                                           weights_to_be_filled_in);
+                    const std::string prepacked_weights_container_key =
+                        GenerateKeyForPrepackedWeightsMap(op_type,
+                                                          weights_to_be_filled_in);
 
-                    bool container_contains_packed_weight = prepacked_weights_container_->HasWeight(prepacked_weights_container_key);
+                    bool container_contains_packed_weight = prepacked_weights_container_->HasWeight(
+                        prepacked_weights_container_key);
 
                     if (container_contains_packed_weight) {
-                      LOGS(logger_, INFO) << "Using cached version of pre-packed weight for constant initializer: " << input_name
-                                          << " used in the node: " << node.Name() << " which is of op type: " << node.OpType();
+                      LOGS(logger_, INFO) << "Using cached version of pre-packed weight for constant initializer: "
+                                          << input_name
+                                          << " used in the node: " << node.Name() << " which is of op type: "
+                                          << node.OpType();
 
+                      const auto& prepacked_shared = prepacked_weights_container_->GetWeight(
+                          prepacked_weights_container_key);
                       ORT_RETURN_IF_ERROR(KernelUseSharedPrePackedBuffers(*kernel, input_idx,
-                                                                          prepacked_weights_container_->GetWeight(prepacked_weights_container_key),
+                                                                          prepacked_shared,
                                                                           node.Name()));
 
                       ++used_shared_pre_packed_weights_counter_;
-                    } else {  // container doesn't contain the pre-packed weight - so write into it for sharing across kernel instances
 
-                      if (!prepacked_weights_container_->WriteWeight(prepacked_weights_container_key, std::move(weights_to_be_filled_in))) {
-                        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unable to write the provided PrePackedWeights instance into the container");
+                      // Write references to what is stored in the shared container
+                      // and release memory mapped entries this container may have loaded from disk
+                      std::ignore = prepacked_subgraph.ReplaceWithReferenceIfSaving(input_name,
+                                                                                    prepacked_weights_container_key,
+                                                                                    prepacked_shared);
+
+                    } else {
+                      // container doesn't contain the pre-packed weight - so write into it for sharing across
+                      // kernel instances
+
+                      // Check if we loaded it from disk, then put it into the shared container so
+                      // everybody can share the same memory mapped entry
+                      // the shared container takes ownership of the memory mapped entries
+
+                      // The next like replaces the existing entry with references to it
+                      // and returns the container that holds the memory mapped entries
+                      // so we can transfer it to shared container.
+                      // if there is not an entry, we replace it with references to weights_to_be_filled_in
+                      // in saving mode and return std::nullopt
+                      auto prepacked_from_disk = prepacked_subgraph.ReplaceWithReferenceIfSaving(
+                          input_name,
+                          prepacked_weights_container_key,
+                          weights_to_be_filled_in);
+
+                      if (prepacked_from_disk.has_value()) {
+                        weights_to_be_filled_in = std::move(*prepacked_from_disk);
                       }
 
+                      if (!prepacked_weights_container_->WriteWeight(prepacked_weights_container_key,
+                                                                     std::move(weights_to_be_filled_in))) {
+                        return ORT_MAKE_STATUS(
+                            ONNXRUNTIME, FAIL,
+                            "Unable to write the provided PrePackedWeights instance into the container");
+                      }
+
+                      const auto& shared_prepacked = prepacked_weights_container_->GetWeight(
+                          prepacked_weights_container_key);
                       ORT_RETURN_IF_ERROR(KernelUseSharedPrePackedBuffers(*kernel, input_idx,
-                                                                          prepacked_weights_container_->GetWeight(prepacked_weights_container_key),
+                                                                          shared_prepacked,
                                                                           node.Name()));
                     }
                   }
 
-                } else {  // caching of pre-packed weights' turned OFF
+                } else {
+                  // cross session caching of pre-packed weights' turned OFF
+                  // we use serialization container to share weights loaded from disk
+                  // within this session. Or if the weight is not present on disk,
+                  // we store the newly minted pre-packed data.
+
                   AllocatorPtr session_cpu_alloc = GetAllocator(kernel->Info().GetDevice(OrtMemType::OrtMemTypeDefault));
-                  ORT_RETURN_IF_ERROR(kernel->PrePack(const_initialized_tensor, input_idx,
-                                                      session_cpu_alloc,  // use allocator tied to this session
+                  PrePackedWeights weights_to_be_filled_in;
+                  // The reason we invoke PrePack() before looking into the container for any pre-packed weight
+                  // cached by another instance of the same op_type (for the same constant initializer) is because
+                  // to truly know if we can use a cached pre-packed weight, we would have to compare the cached
+                  // pre-packed weight with the pre-packed weight generated by this instance of the same op_type because
+                  // other static properties of the node like node attributes could play a role in the pre-packed
+                  // weights' contents.
+                  ORT_RETURN_IF_ERROR(kernel->PrePack(const_initialized_tensor, input_idx, session_cpu_alloc,
                                                       is_packed,
-                                                      nullptr  // no caching required
-                                                      ));
+                                                      &weights_to_be_filled_in));
+
+                  // Some kernels (matmul_nbits and non-CPU related kernels) do not share their pre-packed results
+                  // even though they set is_packed = true so we leave it up to them.
+                  // We can change their behavior if we wish do so in a separate PR
+                  // XXX: Interestingly enough, matmul_nbits does accept shared pre-packs, but does not
+                  // produce them.
+                  if (is_packed && !weights_to_be_filled_in.buffers_.empty()) {
+                    const auto& op_type = node.OpType();
+                    const std::string prepacked_weights_container_key = GenerateKeyForPrepackedWeightsMap(
+                        op_type,
+                        weights_to_be_filled_in);
+
+                    // See if we can use pre-packed data from disk
+                    const auto* weights_to_use = prepacked_subgraph.GetPrepackedWeights(
+                        prepacked_weights_container_key);
+
+                    if (weights_to_use == nullptr) {
+                      // In this case pre-packed container owns the data
+                      prepacked_subgraph.WritePacked(input_name, prepacked_weights_container_key,
+                                                     std::move(weights_to_be_filled_in));
+                      weights_to_use = prepacked_subgraph.GetPrepackedWeights(prepacked_weights_container_key);
+                      assert(weights_to_use != nullptr);
+                    }
+
+                    ORT_RETURN_IF_ERROR(KernelUseSharedPrePackedBuffers(*kernel, input_idx,
+                                                                        *weights_to_use,
+                                                                        node.Name()));
+                  }
                 }
+
                 if (is_packed) {
                   ++number_of_prepacks_counter_;
 
@@ -525,7 +610,8 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap<std::strin
   }
 }
 
-static int64_t CalculateMemoryPatternsKey(const gsl::span<const OrtValue>& tensor_inputs) {
+static int64_t
+CalculateMemoryPatternsKey(const gsl::span<const OrtValue>& tensor_inputs) {
   int64_t key = 0;
   for (const auto& input : tensor_inputs) {
     for (auto dim : input.Get<Tensor>().Shape().GetDims()) key ^= dim;
@@ -1068,9 +1154,12 @@ Status SessionState::CreateSubgraphSessionState() {
 
 // Calculate the use count of a constant initialized tensor, including the use in subgraph.
 // Note: This function doesn't handle the case below:
-// The main graph has a constant initializer called X, and the subgraph also has a constant initializer called X, which overrides the X from main graph.
-// For case like this, the current implementation will calculate the use count as 2, but they could contain completely different values so each should have a use count of 1.
-// This is a very rare case. If it happens and X is prepacked, the consequence is that X won't be released and memory usage of X won't be saved. This will be fine.
+// The main graph has a constant initializer called X, and the subgraph also has a constant initializer called X,
+// which overrides the X from main graph.
+// For case like this, the current implementation will calculate the use count as 2, but they could contain completely
+// different values so each should have a use count of 1.
+// This is a very rare case. If it happens and X is prepacked, the consequence is that X won't be released and memory
+// usage of X won't be saved. This will be fine.
 static void ComputeConstantInitializerUseCount(const Graph& graph, InlinedHashMap<std::string, size_t>& constant_initializers_use_count) {
   for (const auto& node : graph.Nodes()) {
     for (const auto* arg : node.InputDefs()) {
@@ -1189,7 +1278,31 @@ Status SessionState::FinalizeSessionState(const std::basic_string<PATH_CHAR_TYPE
   InlinedHashMap<std::string, size_t> constant_initializers_use_count;
   ComputeConstantInitializerUseCount(graph_, constant_initializers_use_count);
   return FinalizeSessionStateImpl(graph_location, kernel_registry_manager, nullptr, sess_options_,
-                                  remove_initializers, constant_initializers_use_count);
+                                  remove_initializers,
+                                  constant_initializers_use_count,
+                                  prepacked_weights_for_serialization_.MainGraph());
+}
+
+void SessionState::SetSaveModeForPrepacks(bool saving_model,
+                                          bool saving_ort_format) {
+  bool save_prepacked_constant_initializers =
+      sess_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsSavePrePackedConstantInitializers,
+                                                      "0") == "1";
+
+  if (save_prepacked_constant_initializers && !saving_model) {
+    save_prepacked_constant_initializers = false;
+    LOGS(logger_, WARNING)
+        << "SavePrePackedConstantInitializers is set to true but the model is not being saved. Ignoring the flag.";
+  }
+
+  if (save_prepacked_constant_initializers && saving_ort_format) {
+    save_prepacked_constant_initializers = false;
+    LOGS(logger_, WARNING)
+        << "Serializing optimized model in ORT format with external pre-packed constant initializers is not supported."
+        << " Ignoring the flag.";
+  }
+
+  prepacked_weights_for_serialization_.SetSaveMode(save_prepacked_constant_initializers);
 }
 
 static Status Index(const OrtValueNameIdxMap& ort_value_name_idx_map,
@@ -1323,6 +1436,7 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string<PATH_CHAR_
                                               const SessionOptions& session_options,
                                               bool remove_initializers,
                                               InlinedHashMap<std::string, size_t>& constant_initializers_use_count,
+                                              PrepackedForSerialization::Subgraph& prepacked_subgraph,
                                               const InlinedHashMap<OrtValueName, OrtDevice>& outer_scope_node_arg_to_location_map,
                                               bool graph_info_already_created) {
   if (!graph_info_already_created) {
@@ -1489,7 +1603,7 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string<PATH_CHAR_
             return Status::OK();
           },
           logger_, data_transfer_mgr_, external_data_loader_mgr_, *p_seq_exec_plan_, session_options,
-          memory_profile_func, name_to_buffered_tensor_));
+          memory_profile_func, prepacked_subgraph, name_to_buffered_tensor_));
 
 #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
   // Record Weight allocation info on device
@@ -1506,7 +1620,8 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string<PATH_CHAR_
   ORT_RETURN_IF_ERROR(CreateKernels(kernel_registry_manager));
 
   if (!disable_prepacking) {
-    ORT_RETURN_IF_ERROR(PrepackConstantInitializedTensors(constant_initializers_use_count,
+    ORT_RETURN_IF_ERROR(PrepackConstantInitializedTensors(prepacked_subgraph,
+                                                          constant_initializers_use_count,
                                                           session_options.initializers_to_share_map));
   }
 
@@ -1544,9 +1659,13 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string<PATH_CHAR_
                                                                node,
                                                                subgraph_session_state.GetGraphViewer(),
                                                                subgraph_outer_scope_node_arg_to_location_map));
+
+      auto& next_prepacked_subgraph = prepacked_subgraph.GetOrCreateSubgraph(
+          subgraph_session_state.GetGraphViewer().GetGraph());
       ORT_RETURN_IF_ERROR(subgraph_session_state.FinalizeSessionStateImpl(
           graph_location, kernel_registry_manager, &node, subgraph_session_options, remove_initializers,
-          constant_initializers_use_count, subgraph_outer_scope_node_arg_to_location_map, true));
+          constant_initializers_use_count, next_prepacked_subgraph,
+          subgraph_outer_scope_node_arg_to_location_map, true));
 
       // setup all the info for handling the feeds and fetches used in subgraph execution
       auto* p_op_kernel = GetMutableKernel(node.Index());
diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h
index e1674ba4b690b..3e07be4b14dfa 100644
--- a/onnxruntime/core/framework/session_state.h
+++ b/onnxruntime/core/framework/session_state.h
@@ -364,6 +364,20 @@ class SessionState {
 
   const SessionOptions& GetSessionOptions() const { return sess_options_; }
 
+  /// <summary>
+  /// Deduce the flag whether we need to enable or disable
+  /// saving for serialization mode and create the member container with
+  /// the corresponding argument.
+  /// </summary>
+  /// <param name="saving_model"></param>
+  /// <param name="saving_ort_format"></param>
+  void SetSaveModeForPrepacks(bool saving_model,
+                              bool saving_ort_format);
+
+  const PrepackedForSerialization& GetPrepackedForSerialization() const {
+    return prepacked_weights_for_serialization_;
+  }
+
  private:
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SessionState);
 
@@ -381,7 +395,8 @@ class SessionState {
    * Prepack the constant initialized tensors for better performance.
    * The original constant initialized tensors will be removed to save memory.
    */
-  Status PrepackConstantInitializedTensors(InlinedHashMap<std::string, size_t>& constant_initializers_use_count,
+  Status PrepackConstantInitializedTensors(PrepackedForSerialization::Subgraph& prepacked_subgraph,
+                                           InlinedHashMap<std::string, size_t>& constant_initializers_use_count,
                                            const std::unordered_map<std::string, const OrtValue*>& initializers_to_share_map);
 
   SessionState* GetMutableSubgraphSessionState(onnxruntime::NodeIndex index, const std::string& attribute_name);
@@ -400,6 +415,7 @@ class SessionState {
                                   const SessionOptions& session_options,
                                   bool remove_initializers,
                                   InlinedHashMap<std::string, size_t>& constant_initializers_use_count,
+                                  PrepackedForSerialization::Subgraph& prepacked_subgraph,
                                   const InlinedHashMap<OrtValueName, OrtDevice>& outer_scope_node_arg_to_location_map = {},
                                   bool graph_info_already_created = false);
 
@@ -528,6 +544,9 @@ class SessionState {
   // the cache is valid until any session reliant on it is still in scope.
   // prepacked_weights_container_ can be nullptr if no caching is required for prepacked weights
   PrepackedWeightsContainer* const prepacked_weights_container_{};
+  // This container serves either for reading and using pre-packed weights from disk
+  // of serializing to disk
+  PrepackedForSerialization prepacked_weights_for_serialization_;
 
 #ifdef ENABLE_TRAINING
 // Needed for ORTTrainer. Should be removed along with ORTTrainer code
diff --git a/onnxruntime/core/framework/session_state_utils.cc b/onnxruntime/core/framework/session_state_utils.cc
index 2c74805c57dce..b164fbfe72340 100644
--- a/onnxruntime/core/framework/session_state_utils.cc
+++ b/onnxruntime/core/framework/session_state_utils.cc
@@ -68,18 +68,19 @@ struct ExtDataValueDeleter {
 // buffered_tensor is not null, buffered_tensor holds the real buffer pointed
 // by tensor_proto. buffered_tensor must be the owner of the buffer and deleter
 // should release the buffer when tensor_proto is released.
-static inline common::Status ExtDataTensorProtoToTensor(const Env& env,
-                                                        const std::basic_string<PATH_CHAR_TYPE>& proto_path,
-                                                        const ONNX_NAMESPACE::TensorProto& tensor_proto,
-                                                        Tensor& tensor, OrtCallback& ext_data_deleter,
-                                                        Tensor* buffered_tensor = nullptr) {
+static common::Status ExtDataTensorProtoToTensor(const Env& env,
+                                                 const std::basic_string<PATH_CHAR_TYPE>& proto_path,
+                                                 const ONNX_NAMESPACE::TensorProto& tensor_proto,
+                                                 Tensor& tensor, OrtCallback& ext_data_deleter,
+                                                 PrepackedForSerialization::Subgraph& prepacked_subgraph,
+                                                 Tensor* buffered_tensor = nullptr) {
   ORT_ENFORCE(utils::HasExternalData(tensor_proto));
 
   void* ext_data_buf = nullptr;
   SafeInt<size_t> ext_data_len = 0;
   ORT_RETURN_IF_ERROR(utils::GetExtDataFromTensorProto(env, proto_path.c_str(), tensor_proto,
                                                        ext_data_buf, ext_data_len, ext_data_deleter,
-                                                       buffered_tensor));
+                                                       buffered_tensor, &prepacked_subgraph));
 
   // NB: creating a do-nothing allocator per tensor is wasteful; can perhaps be
   // avoided if the Tensor class implements the do-nothing behavior when given a
@@ -100,6 +101,7 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st
                                              const AllocatorPtr& alloc, const AllocatorPtr& default_cpu_alloc,
                                              OrtValue& ort_value, const DataTransferManager& data_transfer_mgr,
                                              const ExternalDataLoaderManager& external_data_loader_mgr,
+                                             PrepackedForSerialization::Subgraph& prepacked_subgraph,
                                              bool use_device_allocator_for_initializers = false,
                                              Tensor* buffered_tensor = nullptr) {
   if (bool(alloc) == (m != nullptr)) {
@@ -127,8 +129,7 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st
       ORT_RETURN_IF_ERROR(utils::LoadExtDataToTensorFromTensorProto(env, proto_path, tensor_proto,
                                                                     *external_data_loader, *p_tensor));
 
-      auto ml_tensor = DataTypeImpl::GetType<Tensor>();
-      ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc());
+      Tensor::InitOrtValue(std::move(*p_tensor), ort_value);
       return common::Status::OK();
     } else if (device_type == OrtDevice::CPU) {
       // for external initializer on CPU we will use mmap for large initializers so don't need to allocate memory in advance
@@ -139,7 +140,8 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st
       // TensorProtoToTensor it would copy the data, causing unnecessary overhead
       OrtCallback ext_data_deleter;
       ORT_RETURN_IF_ERROR(ExtDataTensorProtoToTensor(env, proto_path, tensor_proto, *p_tensor,
-                                                     ext_data_deleter, buffered_tensor));
+                                                     ext_data_deleter, prepacked_subgraph,
+                                                     buffered_tensor));
 
       ExtDataValueDeleter deleter{ext_data_deleter, p_tensor.get()};
       MLDataType ml_tensor_type = DataTypeImpl::GetType<Tensor>();
@@ -163,8 +165,9 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st
       OrtCallback ext_data_deleter;
       std::optional<ScopedOrtCallbackInvoker> scoped_ort_callback_invoker;
       ORT_RETURN_IF_ERROR(ExtDataTensorProtoToTensor(env, proto_path, tensor_proto, *p_deserialize_tensor,
-                                                     ext_data_deleter, buffered_tensor));
-      scoped_ort_callback_invoker = ScopedOrtCallbackInvoker(ext_data_deleter);
+                                                     ext_data_deleter, prepacked_subgraph,
+                                                     buffered_tensor));
+      scoped_ort_callback_invoker.emplace(ext_data_deleter);
       // TODO!! Need a temp buffer allocator for non-escape buffers that maybe too big for stack allocation.
 
       return CopyTensorFromCPUToDevice(data_transfer_mgr, p_deserialize_tensor, p_tensor, ort_value);
@@ -272,6 +275,7 @@ common::Status SaveInitializedTensors(
     const ExecutionPlanBase& exec_plan,
     const SessionOptions& session_options,
     const MemoryProfileFunction& memory_profile_func,
+    PrepackedForSerialization::Subgraph& prepacked_subgraph,
     std::unordered_map<std::string, std::unique_ptr<Tensor>>& buffered_tensors) {
   LOGS(logger, INFO) << "Saving initialized tensors.";
   ORT_ENFORCE(ort_value_name_idx_map.MaxIdx() > -1, "OrtValue indexes should have been populated.");
@@ -401,6 +405,7 @@ common::Status SaveInitializedTensors(
 
       Status st = DeserializeTensorProto(env, graph_loc, tensor_proto, (m.has_value()) ? &*m : nullptr, alloc,
                                          default_cpu_alloc, ort_value, data_transfer_mgr, external_data_loader_mgr,
+                                         prepacked_subgraph,
                                          use_device_allocator_for_initializers, p_tensor);
       if (!st.IsOK()) {
         std::ostringstream oss;
diff --git a/onnxruntime/core/framework/session_state_utils.h b/onnxruntime/core/framework/session_state_utils.h
index af27f5caba0f4..4a978167e8c7a 100644
--- a/onnxruntime/core/framework/session_state_utils.h
+++ b/onnxruntime/core/framework/session_state_utils.h
@@ -9,6 +9,7 @@
 
 #include "core/common/const_pointer_container.h"
 #include "core/framework/allocator.h"
+#include "core/framework/prepacked_weights_container.h"
 #include "core/framework/tensor.h"
 #include "core/framework/tensor_allocator.h"
 #include "core/framework/session_options.h"
@@ -50,6 +51,7 @@ common::Status SaveInitializedTensors(
     const ExecutionPlanBase& exec_plan,
     const SessionOptions& session_options,
     const MemoryProfileFunction& memory_profile_func,
+    PrepackedForSerialization::Subgraph& prepacked_subgraph,
     std::unordered_map<std::string, std::unique_ptr<Tensor>>& buffered_tensors);
 
 common::Status AllocateTensor(
diff --git a/onnxruntime/core/framework/tensor_external_data_info.cc b/onnxruntime/core/framework/tensor_external_data_info.cc
index 93146e66d9f24..88a11533ef227 100644
--- a/onnxruntime/core/framework/tensor_external_data_info.cc
+++ b/onnxruntime/core/framework/tensor_external_data_info.cc
@@ -3,8 +3,13 @@
 
 #include "tensor_external_data_info.h"
 #include "core/common/common.h"
+#include "core/common/narrow.h"
+#include "core/common/safeint.h"
+#include "core/common/string_utils.h"
 #include "core/platform/path_lib.h"
 
+#include <vector>
+
 #ifdef _WIN32
 #include <Windows.h>
 #endif
@@ -14,8 +19,24 @@ using ::ONNX_NAMESPACE::StringStringEntryProto;
 namespace onnxruntime {
 Status ExternalDataInfo::Create(const RepeatedPtrField<StringStringEntryProto>& input,
                                 std::unique_ptr<ExternalDataInfo>& out) {
+  auto str_to_int = [](const std::string& s, OFFSET_TYPE& result) -> Status {
+    char* end;
+#ifdef _WIN32
+    result = _strtoi64(s.c_str(), &end, 10);
+#else
+    result = OrtStrToPtrDiff(s.c_str(), &end);
+#endif
+    if (end != s.c_str() + s.length()) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "parsing ", s, " failed");
+    }
+    return Status::OK();
+  };
+
   out = std::make_unique<ExternalDataInfo>();
+  PrepackedInfos prepacked_infos;
+
   const int input_size = input.size();
+
   for (int i = 0; i != input_size; ++i) {
     StringStringEntryProto stringmap = input[i];
     if (!stringmap.has_key())
@@ -25,28 +46,110 @@ Status ExternalDataInfo::Create(const RepeatedPtrField<StringStringEntryProto>&
     if (stringmap.key() == "location" && !stringmap.value().empty()) {
       out->rel_path_ = ToWideString(stringmap.value());
     } else if (stringmap.key() == "offset" && !stringmap.value().empty()) {
-      char* end;
-#ifdef _WIN32
-      out->offset_ = _strtoi64(stringmap.value().c_str(), &end, 10);
-#else
-      out->offset_ = OrtStrToPtrDiff(stringmap.value().c_str(), &end);
-#endif
-      if (end != stringmap.value().c_str() + stringmap.value().length())
-        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "parsing ", stringmap.value(), " failed");
+      ORT_RETURN_IF_ERROR(str_to_int(stringmap.value(), out->offset_));
     } else if (stringmap.key() == "length" && !stringmap.value().empty()) {
       char* end;
-      out->length_ = static_cast<size_t>(OrtStrToPtrDiff(stringmap.value().c_str(), &end));
+      out->length_ = narrow<size_t>(OrtStrToPtrDiff(stringmap.value().c_str(), &end));
       if (end != stringmap.value().c_str() + stringmap.value().length())
         return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "parsing ", stringmap.value(), " failed");
     } else if (stringmap.key() == "checksum" && !stringmap.value().empty()) {
       out->checksum_ = stringmap.value();
+    } else if (stringmap.key().find("prepacked", 0) == 0) {
+      // Starts with 'prepacked', each has its own key.
+      // Each prepacked entry may have multiple blobs with the same key
+      // we output them with the same key
+      // format = key|offset;length;checksum[|offset;length;checksum]
+      // We are ignoring invalid entries (should not be any), and rely
+      // on in memory pre-packs regenerated in this case.
+      // users can over-write this file with the correct pre-packed info.
+      const std::string& prepacked = stringmap.value();
+      if (!prepacked.empty()) {
+        auto split_fields = utils::SplitString(prepacked, "|", false);
+        if (split_fields.size() > 1) {
+          const std::string key{split_fields[0]};
+          auto& blob_infos = prepacked_infos[key];
+          for (size_t f = 1; f < split_fields.size(); ++f) {
+            const auto& blob = split_fields[f];
+            auto blob_fields = utils::SplitString(blob, ";", false);
+            if (blob_fields.size() == 3) {
+              OFFSET_TYPE offset, len;
+              ORT_RETURN_IF_ERROR(str_to_int(std::string(blob_fields[0]), offset));
+              ORT_RETURN_IF_ERROR(str_to_int(std::string(blob_fields[1]), len));
+              blob_infos.push_back(std::make_tuple(offset, narrow<size_t>(len), std::string(blob_fields[2])));
+            }
+          }
+          if (blob_infos.empty()) {
+            prepacked_infos.erase(key);
+          }
+        }
+      }
     } else {
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "model format error!");
     }
   }
+
   if (out->rel_path_.empty()) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "model format error! Missing 'location'");
   }
+
+  if (!prepacked_infos.empty()) {
+    out->prepacked_infos_ = std::move(prepacked_infos);
+  }
+
   return Status::OK();
 }
+void ExternalDataInfo::SetExternalLocationToProto(const std::filesystem::path& external_file_path,
+                                                  int64_t external_offset, size_t tensor_bytes_size,
+                                                  ::ONNX_NAMESPACE::TensorProto& proto) {
+  proto.set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL);
+
+  auto* location = proto.add_external_data();
+  location->set_key("location");
+  location->set_value(ToUTF8String(external_file_path.native()));
+
+  auto* offset = proto.add_external_data();
+  offset->set_key("offset");
+  offset->set_value(std::to_string(external_offset));
+
+  auto* length = proto.add_external_data();
+  length->set_key("length");
+  length->set_value(std::to_string(tensor_bytes_size));
+}
+
+std::ostream& ExternalDataInfo::WritePrepackedToFileAndAddToProto(
+    const PrepackedForSerialization& prepacked_for_serialization,
+    const InlinedHashSet<std::string>& blob_keys, bool align, int64_t allocation_granularity,
+    std::ostream& os, int64_t& external_offset, ::ONNX_NAMESPACE::TensorProto& proto) {
+  for (const auto& key : blob_keys) {
+    size_t prepack_count = 0;
+    const auto* prepacked_weights = prepacked_for_serialization.GetPrepackedForKey(key);
+    ORT_ENFORCE(prepacked_weights != nullptr, "Prepacked weights not found for key ", key);
+    std::stringstream prepacked_entry;
+    prepacked_entry << key << "|";
+    for (size_t i = 0, size = prepacked_weights->buffers_.size(); i < size; ++i) {
+      if (align) {
+        // return early on error
+        if (!AlignAndPad(os, allocation_granularity, external_offset)) {
+          return os;
+        }
+      }
+      const auto size_in_bytes = prepacked_weights->buffer_sizes_[i];
+      if (prepack_count++ > 0) {
+        prepacked_entry << "|";
+      }
+      // Checksum is currently not validated
+      prepacked_entry << external_offset << ";" << size_in_bytes << ";0";
+      if (!os.write(reinterpret_cast<const char*>(prepacked_weights->buffers_[i].get()), size_in_bytes)) {
+        return os;
+      }
+      external_offset = SafeInt<int64_t>(external_offset) + size_in_bytes;
+    }
+    auto* prepacked = proto.add_external_data();
+    std::string prepacked_key("prepacked_");
+    prepacked_key.append(std::to_string(prepack_count));
+    prepacked->set_key(std::move(prepacked_key));
+    prepacked->set_value(prepacked_entry.str());
+  }
+  return os;
+}
 }  // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/framework/tensor_external_data_info.h b/onnxruntime/core/framework/tensor_external_data_info.h
index afc8fda6c3037..badd2137f3472 100644
--- a/onnxruntime/core/framework/tensor_external_data_info.h
+++ b/onnxruntime/core/framework/tensor_external_data_info.h
@@ -2,12 +2,19 @@
 // Licensed under the MIT License.
 #pragma once
 
+#include <filesystem>
+#include <ostream>
 #include <string>
+#include <tuple>
+
+#include <core/common/inlined_containers_fwd.h>
 #include "core/common/status.h"
+#include "core/common/path_string.h"
+#include "core/framework/prepacked_weights_container.h"
 #include "core/graph/onnx_protobuf.h"
-#include "core/session/onnxruntime_c_api.h"
 
 namespace onnxruntime {
+
 class ExternalDataInfo {
  public:
 #ifdef _WIN32
@@ -16,7 +23,7 @@ class ExternalDataInfo {
   using OFFSET_TYPE = off_t;
 #endif
 
-  const std::basic_string<ORTCHAR_T>& GetRelPath() const { return rel_path_; }
+  const PathString& GetRelPath() const { return rel_path_; }
 
   OFFSET_TYPE GetOffset() const { return offset_; }
   size_t GetLength() const { return length_; }
@@ -29,12 +36,58 @@ class ExternalDataInfo {
       const ::google::protobuf::RepeatedPtrField<::ONNX_NAMESPACE::StringStringEntryProto>& input,
       std::unique_ptr<ExternalDataInfo>& out);
 
+  static void SetExternalLocationToProto(const std::filesystem::path& external_file_path,
+                                         int64_t offset,
+                                         size_t tensor_bytes_size,
+                                         ::ONNX_NAMESPACE::TensorProto& proto);
+
+  // Pads the output with zeros according to the specified allocation_granularity
+  // It updates external_offset for alignment.
+  // need to do padding before write actual tensor data as we do offset alignment at the begin of
+  // large tensors (offset need to be page aligned and allocation granularity aligned) like below:
+  // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX
+  // |<---smaller tensor---->|<---padding--->|<------------------large tensor----------------------------->|
+  static std::ostream& AlignAndPad(std::ostream& stream, int64_t allocation_granularity, int64_t& external_offset) {
+    // Align to the larger of the page size or the allocation granularity
+    int64_t alignment_factor = std::max(static_cast<int64_t>(4096), allocation_granularity);
+    // Align to the next page or alloc granularity boundary
+    int64_t new_external_offset = static_cast<int64_t>(
+                                      std::floor((external_offset + alignment_factor - 1) / alignment_factor)) *
+                                  alignment_factor;
+
+    // padding tensor with zeros for alignment
+    for (int64_t index = external_offset; index != new_external_offset; ++index) {
+      stream << '\0';
+    }
+    external_offset = new_external_offset;
+    return stream;
+  }
+
+  static std::ostream& WritePrepackedToFileAndAddToProto(
+      const PrepackedForSerialization& prepacked_for_serialization,
+      const InlinedHashSet<std::string>& blob_keys,
+      bool align, int64_t allocation_granularity,
+      std::ostream& os,
+      int64_t& external_offset,
+      ::ONNX_NAMESPACE::TensorProto& proto);
+
+  using PrepackedInfo = std::tuple<OFFSET_TYPE, size_t, std::string>;
+  using PrepackedInfos = std::unordered_map<std::string, std::vector<PrepackedInfo>>;
+
+  bool HasPrepackedInfo() const noexcept { return !prepacked_infos_.empty(); }
+
+  PrepackedInfos&& TakePrepackedInfos() { return std::move(prepacked_infos_); }
+
  private:
-  std::basic_string<ORTCHAR_T> rel_path_;
+  PathString rel_path_;
   OFFSET_TYPE offset_ = 0;
 
   // 0 means the whole file
   size_t length_ = 0;
   std::string checksum_;
+
+  // Pre-packed blobs found associated with this TensorProto if present
+  // format key, offset, length, checksum
+  PrepackedInfos prepacked_infos_;
 };
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc
index 2af9f95ad059e..f7806d0c1735a 100644
--- a/onnxruntime/core/framework/tensorprotoutils.cc
+++ b/onnxruntime/core/framework/tensorprotoutils.cc
@@ -234,7 +234,8 @@ Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto,
                            const std::filesystem::path& tensor_proto_dir,
                            std::basic_string<ORTCHAR_T>& external_file_path,
                            onnxruntime::FileOffsetType& file_offset,
-                           SafeInt<size_t>& tensor_byte_size) {
+                           SafeInt<size_t>& tensor_byte_size,
+                           ExternalDataInfo::PrepackedInfos* prepacked_infos) {
   ORT_RETURN_IF_NOT(onnxruntime::utils::HasExternalData(tensor_proto),
                     "Tensor does not have external data to read from.");
 
@@ -258,6 +259,10 @@ Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto,
 
   file_offset = external_data_info->GetOffset();
 
+  if (prepacked_infos != nullptr && external_data_info->HasPrepackedInfo()) {
+    *prepacked_infos = external_data_info->TakePrepackedInfos();
+  }
+
   return Status::OK();
 }
 
@@ -988,7 +993,8 @@ static Status GetFileContent(const Env& env, const std::filesystem::path& file_p
 Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& model_path,
                                  const ONNX_NAMESPACE::TensorProto& tensor_proto, void*& ext_data_buf,
                                  SafeInt<size_t>& ext_data_len, OrtCallback& ext_data_deleter,
-                                 Tensor* buffered_tensor) {
+                                 Tensor* buffered_tensor,
+                                 PrepackedForSerialization::Subgraph* prepacked_info) {
   ORT_ENFORCE(utils::HasExternalData(tensor_proto));
   std::basic_string<ORTCHAR_T> tensor_proto_dir;
   if (!model_path.empty()) {
@@ -997,8 +1003,13 @@ Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& mo
   std::basic_string<ORTCHAR_T> external_data_file_path;
   FileOffsetType file_offset;
   SafeInt<size_t> raw_data_safe_len = 0;
+  std::optional<ExternalDataInfo::PrepackedInfos> prepacked_infos;
+  if (prepacked_info != nullptr) {
+    prepacked_infos.emplace();
+  }
   ORT_RETURN_IF_ERROR(
-      GetExternalDataInfo(tensor_proto, tensor_proto_dir, external_data_file_path, file_offset, raw_data_safe_len));
+      GetExternalDataInfo(tensor_proto, tensor_proto_dir, external_data_file_path, file_offset,
+                          raw_data_safe_len, (prepacked_info != nullptr) ? &*prepacked_infos : nullptr));
 
   if (external_data_file_path == onnxruntime::utils::kTensorProtoMemoryAddressTag) {
     // the value in location is the memory address of the data
@@ -1042,6 +1053,33 @@ Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& mo
     ORT_RETURN_IF_ERROR(GetFileContent(env, external_data_file_path.c_str(), file_offset, raw_data_safe_len,
                                        ext_data_buf, ext_data_deleter));
     ext_data_len = raw_data_safe_len;
+
+    if (prepacked_info != nullptr && !prepacked_infos->empty()) {
+      for (const auto& [key, blobs] : *prepacked_infos) {
+        PrePackedWeights prepacked_weights;
+        prepacked_weights.buffers_.reserve(blobs.size());
+        prepacked_weights.buffer_sizes_.reserve(blobs.size());
+        for (const auto& blob : blobs) {
+          const auto blob_offset = std::get<0>(blob);
+          const auto blob_length = std::get<1>(blob);
+          SafeInt<FileOffsetType> end_of_blob{blob_offset};
+          end_of_blob += blob_length;
+          ORT_RETURN_IF(blob_offset < 0 || static_cast<std::uintmax_t>(end_of_blob) > file_length,
+                        "Pre-packed blob: ", key, " offset: ", blob_offset, " file_length: ", file_length,
+                        " is out of bounds and can not read in full");
+          void* data_ptr;
+          OrtCallback data_deleter;
+          ORT_RETURN_IF_ERROR(GetFileContent(env, external_data_file_path.c_str(), blob_offset, blob_length,
+                                             data_ptr, data_deleter));
+          BufferUniquePtr data_ptr_unique{data_ptr, OrtCallbackInvoker(data_deleter)};
+          prepacked_weights.buffers_.push_back(std::move(data_ptr_unique));
+          prepacked_weights.buffer_sizes_.push_back(blob_length);
+        }
+        if (!blobs.empty()) {
+          prepacked_info->InsertFromDisk(key, std::move(prepacked_weights));
+        }
+      }
+    }
 #endif
   }
 
diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h
index 262f7adaca1cb..bd122e3ff2cca 100644
--- a/onnxruntime/core/framework/tensorprotoutils.h
+++ b/onnxruntime/core/framework/tensorprotoutils.h
@@ -3,20 +3,21 @@
 
 #pragma once
 
-#include <vector>
-#include <type_traits>
-#include <string>
 #include <filesystem>
+#include <string>
+#include <type_traits>
+#include <vector>
 
 #ifndef SHARED_PROVIDER
 #include "core/common/common.h"
 #include "core/common/status.h"
 #include "core/common/safeint.h"
-#include "core/framework/endian_utils.h"
 #include "core/framework/allocator.h"
+#include "core/framework/endian_utils.h"
 #include "core/framework/external_data_loader.h"
-#include "core/framework/ort_value.h"
 #include "core/framework/mem_buffer.h"
+#include "core/framework/ort_value.h"
+#include "core/framework/prepacked_weights_container.h"
 #include "core/framework/tensor_external_data_info.h"
 #include "core/graph/onnx_protobuf.h"
 #include "core/platform/env.h"
@@ -36,7 +37,8 @@ Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto,
                            const std::filesystem::path& tensor_proto_dir,
                            std::basic_string<ORTCHAR_T>& external_file_path,
                            onnxruntime::FileOffsetType& file_offset,
-                           SafeInt<size_t>& tensor_byte_size);
+                           SafeInt<size_t>& tensor_byte_size,
+                           ExternalDataInfo::PrepackedInfos* prepacked_infos = nullptr);
 /**
  * This function is used to convert the endianess of Tensor data.
  * Mostly, will be used in big endian system to support the model file
@@ -172,7 +174,8 @@ common::Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::
                                          const ONNX_NAMESPACE::TensorProto& tensor_proto,
                                          void*& ext_data_buf, SafeInt<size_t>& ext_data_len,
                                          OrtCallback& ext_data_deleter,
-                                         Tensor* buffered_tensor = nullptr);
+                                         Tensor* buffered_tensor = nullptr,
+                                         PrepackedForSerialization::Subgraph* prepacked_blobs = nullptr);
 
 // Given a tensor proto with external data obtain a tensor using the specified custom external data loader.
 common::Status LoadExtDataToTensorFromTensorProto(const Env& env, const std::filesystem::path& model_path,
diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc
index e8a5855b36496..758c717045537 100644
--- a/onnxruntime/core/graph/graph.cc
+++ b/onnxruntime/core/graph/graph.cc
@@ -18,6 +18,7 @@
 #include "core/flatbuffers/flatbuffers_utils.h"
 #include "core/flatbuffers/schema/ort.fbs.h"
 #include "core/framework/tensor_shape.h"
+#include "core/framework/tensor_external_data_info.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/framework/utils.h"
 #include "core/graph/graph_flatbuffers_utils.h"
@@ -25,6 +26,7 @@
 #include "core/graph/indexed_sub_graph.h"
 #include "core/graph/model.h"
 #include "core/graph/model_load_utils.h"
+#include "core/graph/model_saving_options.h"
 #include "core/graph/node_attr_utils.h"
 #include "core/graph/op.h"
 #include "core/graph/runtime_optimization_record_container.h"
@@ -4084,82 +4086,102 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProto() const {
   return result;
 }
 
-ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
-                                                                       const std::filesystem::path& model_file_path,
-                                                                       size_t initializer_size_threshold,
-                                                                       const OffsetAlignmentInfo& align_info) const {
-  GraphProto result;
-  ToGraphProtoInternal(result);
-  ORT_ENFORCE(external_file_path.is_relative());
-  // If model_file_path is just a file name without a path separator, for example: "model.onnx". Its parent path could
-  // be empty. Else, save external data file in same directory as the model.
-  const std::filesystem::path modified_external_file_path = model_file_path.parent_path() / external_file_path;
+// A recursive function that does bottom up with subgraphs
+Status Graph::ToGraphProtoWithExternalInitiallizersImpl(
+    const std::filesystem::path& model_path,
+    const std::filesystem::path& external_file_path,
+    const std::filesystem::path& modified_external_file_path,
+    const ModelSavingOptions& model_saving_options,
+    WeightToPrePacksMap& unprocessed_prepacks,
+    ONNX_NAMESPACE::GraphProto& output_graph_proto,
+    std::ostream& external_stream,
+    int64_t& external_offset) const {
+  // Process subgraphs
+  for (const auto& node : Nodes()) {
+    if (node.ContainsSubgraph()) {
+      // Let find this node in the output_graph_proto
+      auto hit = std::find_if(output_graph_proto.mutable_node()->begin(),
+                              output_graph_proto.mutable_node()->end(),
+                              [&node](const ONNX_NAMESPACE::NodeProto& proto) {
+                                return proto.name() == node.Name();
+                              });
+      ORT_RETURN_IF_NOT(hit != output_graph_proto.mutable_node()->end(), "Node ", node.Name(),
+                        " not found in output_graph_proto");
+      auto& result_node = *hit;
+      for (const auto& [name, subgraph] : node.GetAttributeNameToSubgraphMap()) {
+        // Lets find this subgraph in the result_node
+        auto sub_hit = std::find_if(result_node.mutable_attribute()->begin(),
+                                    result_node.mutable_attribute()->end(),
+                                    [&name](const ONNX_NAMESPACE::AttributeProto& proto) {
+                                      return proto.name() == name;
+                                    });
+        ORT_RETURN_IF_NOT(sub_hit != result_node.mutable_attribute()->end() && utils::HasGraph(*sub_hit),
+                          "Subgraph ", name, " not found in node ", node.Name());
+        auto& result_subgraph = *sub_hit->mutable_g();
+        ORT_RETURN_IF_ERROR(subgraph->ToGraphProtoWithExternalInitiallizersImpl(
+            model_path, external_file_path,
+            modified_external_file_path, model_saving_options,
+            unprocessed_prepacks, result_subgraph,
+            external_stream, external_offset));
+      }
+    }
+  }
 
-  std::ofstream external_stream(modified_external_file_path, std::ofstream::out | std::ofstream::binary);
-  ORT_ENFORCE(external_stream.is_open());
-  int64_t external_offset = 0;
+  // Used only when pre-packed weights are serialized
+  InlinedHashSet<std::string> processed_weights;
+  const PrepackedForSerialization::Subgraph* prepacked_subgraph = nullptr;
+  bool process_prepacks = false;
+  if (model_saving_options.prepacked_for_save != nullptr) {
+    // Is there any pre-packed weights for this subgraph?
+    prepacked_subgraph = model_saving_options.prepacked_for_save->FindPrepackedGraph(*this);
+    if (prepacked_subgraph != nullptr) {
+      processed_weights.reserve(graph_proto_->initializer_size());
+      process_prepacks = true;
+    }
+  }
 
   // Add the initializers to the result graph.
-  const auto& model_path = ModelPath();
-#if !defined(DISABLE_SPARSE_TENSORS)
-  const auto sparse_end = sparse_tensor_names_.end();
-#endif
-
   for (const auto& initializer : graph_proto_->initializer()) {
 #if !defined(DISABLE_SPARSE_TENSORS)
-    if (sparse_end != sparse_tensor_names_.find(initializer.name())) {
+    if (IsSparseInitializer(initializer.name())) {
       // Sparse tensors are added to the ONNX file.
-      auto& sparse_initializer = *result.add_sparse_initializer();
+      auto& sparse_initializer = *output_graph_proto.add_sparse_initializer();
       auto status = utils::DenseTensorToSparseTensorProto(initializer, model_path, sparse_initializer);
-      ORT_ENFORCE(status.IsOK(), "Failed to convert dense initializer to sparse");
+      ORT_RETURN_IF_NOT(status.IsOK(), "Failed to convert dense initializer to sparse");
     } else {
 #endif
       // Dense tensors larger than the threshold are added to the external file.
-      TensorProto* output_proto = result.add_initializer();
+      TensorProto* output_proto = output_graph_proto.add_initializer();
 
       std::vector<uint8_t> raw_data;
-      ORT_THROW_IF_ERROR(utils::UnpackInitializerData(initializer, model_path, raw_data));
+      ORT_RETURN_IF_ERROR(utils::UnpackInitializerData(initializer, model_path, raw_data));
       size_t tensor_bytes_size = raw_data.size();
-      if (tensor_bytes_size < initializer_size_threshold) {
+      if (tensor_bytes_size < model_saving_options.initializer_size_threshold) {
         *output_proto = initializer;
+        if (process_prepacks) {
+          // These pre-packs will reside in memory
+          processed_weights.insert(initializer.name());
+        }
         continue;
       }
 
       // update external_offset for alignment
       // need to do padding before write actual tensor data as we do offset alignment at the begin of
-      // large tensors (offset need to be page aligned and alloction granularity aligned) like below:
+      // large tensors (offset need to be page aligned and allocation granularity aligned) like below:
       // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX
-      // |<---small tensor---->|<---padding--->|<------------------large tensor----------------------------->|
-      if (align_info.align_offset && static_cast<int64_t>(tensor_bytes_size) > align_info.align_threshold) {
-        // Align to the larger of the page size or the allocation granularity
-        int64_t alignment_factor = std::max(static_cast<int64_t>(4096), align_info.allocation_granularity);
-        // Align to the next page or alloc granularity boundary
-        int64_t new_external_offset = static_cast<int64_t>(
-                                          std::floor((external_offset + alignment_factor - 1) / alignment_factor)) *
-                                      alignment_factor;
-
-        // padding tensor with zeros for alignment
-        for (int64_t index = external_offset; index != new_external_offset; ++index) {
-          external_stream << '0';
-        }
-
-        external_offset = new_external_offset;
+      // |<---smaller tensor---->|<---padding--->|<------------------large tensor----------------------------->|
+      if (model_saving_options.align_offset && static_cast<int64_t>(tensor_bytes_size) >
+                                                   model_saving_options.align_threshold) {
+        ORT_RETURN_IF_NOT(ExternalDataInfo::AlignAndPad(external_stream, model_saving_options.allocation_granularity,
+                                                        external_offset),
+                          "Failed writing external data to: ", modified_external_file_path);
       }
 
-      for (size_t index = 0; index != tensor_bytes_size; ++index) {
-        external_stream << raw_data[index];
-      }
+      ORT_RETURN_IF_NOT(external_stream.write(reinterpret_cast<const char*>(raw_data.data()), tensor_bytes_size),
+                        "Failed to write external initializers to file: ", modified_external_file_path);
 
-      output_proto->set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL);
-      ONNX_NAMESPACE::StringStringEntryProto* location = output_proto->add_external_data();
-      location->set_key("location");
-      location->set_value(ToUTF8String(external_file_path.native()));
-      ONNX_NAMESPACE::StringStringEntryProto* offset = output_proto->add_external_data();
-      offset->set_key("offset");
-      offset->set_value(std::to_string(external_offset));
-      ONNX_NAMESPACE::StringStringEntryProto* length = output_proto->add_external_data();
-      length->set_key("length");
-      length->set_value(std::to_string(tensor_bytes_size));
+      ExternalDataInfo::SetExternalLocationToProto(external_file_path, external_offset,
+                                                   tensor_bytes_size, *output_proto);
 
       output_proto->set_name(initializer.name());
       output_proto->set_data_type(initializer.data_type());
@@ -4169,11 +4191,85 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(const std
       output_proto->set_doc_string(initializer.doc_string());
 
       external_offset += tensor_bytes_size;
+
+      if (process_prepacks) {
+        // check if this weight was referred to in subgraphs
+        InlinedHashSet<std::string> blob_keys_to_external_data;
+        auto hit = unprocessed_prepacks.find(initializer.name());
+        if (hit != unprocessed_prepacks.end()) {
+          blob_keys_to_external_data = std::move(hit->second);
+          unprocessed_prepacks.erase(hit);
+        }
+
+        // See if this weight has any pre-prepacks referred to in this graph.
+        const auto* blobs_keys_for_weight = prepacked_subgraph->GetBlobsForWeight(initializer.name());
+        if (blobs_keys_for_weight != nullptr && !blobs_keys_for_weight->empty()) {
+          // Add all the blob_keys to the set of keys to process
+          blob_keys_to_external_data.insert(blobs_keys_for_weight->begin(), blobs_keys_for_weight->end());
+        }
+
+        if (!blob_keys_to_external_data.empty()) {
+          ORT_RETURN_IF_NOT(ExternalDataInfo::WritePrepackedToFileAndAddToProto(
+              *model_saving_options.prepacked_for_save, blob_keys_to_external_data,
+              model_saving_options.align_offset,
+              model_saving_options.allocation_granularity,
+              external_stream, external_offset, *output_proto));
+        }
+
+        processed_weights.insert(initializer.name());
+      }
+
 #if !defined(DISABLE_SPARSE_TENSORS)
     }
 #endif
   }
 
+  // Check if there are any pre-packed weights this graph refers to, but they have
+  // not been processed
+  if (process_prepacks) {
+    const auto& sorted_by_weights = prepacked_subgraph->GetSortedByWeightForWriting();
+    for (const auto& [weight_name, blob_keys] : sorted_by_weights) {
+      if (processed_weights.find(weight_name) == processed_weights.end()) {
+        unprocessed_prepacks[weight_name].insert(blob_keys.begin(), blob_keys.end());
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(
+    const std::filesystem::path& external_file_path,
+    const std::filesystem::path& model_file_path,
+    const ModelSavingOptions& model_saving_options) const {
+  GraphProto result;
+  ToGraphProtoInternal(result);
+  ORT_ENFORCE(external_file_path.is_relative());
+  // If model_file_path is just a file name without a path separator, for example: "model.onnx". Its parent path could
+  // be empty. Else, save external data file in same directory as the model.
+  const std::filesystem::path modified_external_file_path = model_file_path.parent_path() / external_file_path;
+  const auto& model_path = ModelPath();
+
+  // Create the external file.
+  std::ofstream external_stream(modified_external_file_path, std::ofstream::out | std::ofstream::binary);
+  ORT_ENFORCE(external_stream.is_open(), "Failed to open for writing:", modified_external_file_path);
+  int64_t external_offset = 0;
+
+  WeightToPrePacksMap unprocessed_prepacks;
+  ORT_THROW_IF_ERROR(ToGraphProtoWithExternalInitiallizersImpl(model_path, external_file_path,
+                                                               modified_external_file_path, model_saving_options,
+                                                               unprocessed_prepacks, result,
+                                                               external_stream, external_offset));
+
+  if (!external_stream.flush()) {
+    ORT_THROW("Failed to flush file with external initializers: ", modified_external_file_path);
+  }
+
+  if (!unprocessed_prepacks.empty()) {
+    ORT_THROW("BUG. Failed to process all of them pre-packed weights for outerscope external initializers: ",
+              modified_external_file_path);
+  }
+
   return result;
 }
 
diff --git a/onnxruntime/core/graph/model.cc b/onnxruntime/core/graph/model.cc
index 1bae63b510563..be0531e6473fb 100644
--- a/onnxruntime/core/graph/model.cc
+++ b/onnxruntime/core/graph/model.cc
@@ -383,14 +383,12 @@ ModelProto Model::ToProto() const {
 
 ModelProto Model::ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name,
                                                        const std::filesystem::path& file_path,
-                                                       size_t initializer_size_threshold,
-                                                       const Graph::OffsetAlignmentInfo& align_info) const {
+                                                       const ModelSavingOptions& model_saving_options) const {
   ModelProto result(model_proto_);
   const auto& graph = *graph_;
   *(result.mutable_graph()) = graph.ToGraphProtoWithExternalInitializers(external_file_name,
                                                                          file_path,
-                                                                         initializer_size_threshold,
-                                                                         align_info);
+                                                                         model_saving_options);
   return result;
 }
 
@@ -607,16 +605,13 @@ template <typename T>
 static Status SaveModelWithExternalInitializers(Model& model,
                                                 const T& file_path,
                                                 const std::filesystem::path& external_file_name,
-                                                size_t initializer_size_threshold,
-                                                const Graph::OffsetAlignmentInfo& align_info) {
+                                                const ModelSavingOptions& save_options) {
   int fd = 0;
   Status status = Env::Default().FileOpenWr(file_path, fd);
   ORT_RETURN_IF_ERROR(status);
 
   ORT_TRY {
-    status = Model::SaveWithExternalInitializers(model, fd, file_path, external_file_name,
-                                                 initializer_size_threshold,
-                                                 align_info);
+    status = Model::SaveWithExternalInitializers(model, fd, file_path, external_file_name, save_options);
   }
   ORT_CATCH(const std::exception& ex) {
     ORT_HANDLE_EXCEPTION([&]() {
@@ -646,10 +641,8 @@ Status Model::Load(const PathString& file_path, std::shared_ptr<Model>& p_model,
 
 Status Model::SaveWithExternalInitializers(Model& model, const std::filesystem::path& file_path,
                                            const std::filesystem::path& external_file_name,
-                                           size_t initializer_size_threshold,
-                                           const Graph::OffsetAlignmentInfo& align_info) {
-  return SaveModelWithExternalInitializers(model, file_path, external_file_name, initializer_size_threshold,
-                                           align_info);
+                                           const ModelSavingOptions& save_options) {
+  return SaveModelWithExternalInitializers(model, file_path, external_file_name, save_options);
 }
 
 Status Model::LoadFromBytes(int count, const void* p_bytes, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) {
@@ -765,8 +758,7 @@ Status Model::SaveWithExternalInitializers(Model& model,
                                            int fd,
                                            const std::filesystem::path& file_path,
                                            const std::filesystem::path& external_file_name,
-                                           size_t initializer_size_threshold,
-                                           const Graph::OffsetAlignmentInfo& align_info) {
+                                           const ModelSavingOptions& model_saving_options) {
   if (fd < 0) {
     return Status(ONNXRUNTIME, INVALID_ARGUMENT, "<fd> is less than 0.");
   }
@@ -774,8 +766,7 @@ Status Model::SaveWithExternalInitializers(Model& model,
   ORT_RETURN_IF_ERROR(model.MainGraph().Resolve());
 
   auto model_proto = model.ToGraphProtoWithExternalInitializers(external_file_name, file_path,
-                                                                initializer_size_threshold,
-                                                                align_info);
+                                                                model_saving_options);
   google::protobuf::io::FileOutputStream output(fd);
   const bool result = model_proto.SerializeToZeroCopyStream(&output) && output.Flush();
   if (result) {
diff --git a/onnxruntime/core/graph/model.h b/onnxruntime/core/graph/model.h
index 9bcec6f78ca08..21fa5a2ff97bf 100644
--- a/onnxruntime/core/graph/model.h
+++ b/onnxruntime/core/graph/model.h
@@ -20,6 +20,8 @@
 
 namespace onnxruntime {
 
+class PrepackedForSerialization;
+
 namespace fbs {
 struct Model;
 }  // namespace fbs
@@ -190,15 +192,7 @@ class Model {
   // initializer offset could be page aligned and allocation granularity aligned for mmap support.
   ONNX_NAMESPACE::ModelProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name,
                                                                   const std::filesystem::path& file_path,
-                                                                  size_t initializer_size_threshold,
-                                                                  const Graph::OffsetAlignmentInfo& align_info) const;
-
-  ONNX_NAMESPACE::ModelProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name,
-                                                                  const std::filesystem::path& file_path,
-                                                                  size_t initializer_size_threshold) const {
-    Graph::OffsetAlignmentInfo default_align_info;
-    return ToGraphProtoWithExternalInitializers(external_file_name, file_path, initializer_size_threshold, default_align_info);
-  }
+                                                                  const ModelSavingOptions& model_saving_options) const;
 
   static common::Status Save(Model& model, const PathString& file_path);
 
@@ -209,32 +203,13 @@ class Model {
   static common::Status SaveWithExternalInitializers(Model& model,
                                                      const std::filesystem::path& file_path,
                                                      const std::filesystem::path& external_file_path,
-                                                     size_t initializer_size_threshold,
-                                                     const Graph::OffsetAlignmentInfo& align_info);
-
-  static common::Status SaveWithExternalInitializers(Model& model,
-                                                     const std::filesystem::path& file_path,
-                                                     const std::filesystem::path& external_file_path,
-                                                     size_t initializer_size_threshold) {
-    Graph::OffsetAlignmentInfo default_align_info;
-    return SaveWithExternalInitializers(model, file_path, external_file_path, initializer_size_threshold, default_align_info);
-  }
-
-  static common::Status SaveWithExternalInitializers(Model& model,
-                                                     int fd,
-                                                     const std::filesystem::path& file_path,
-                                                     const std::filesystem::path& external_file_path,
-                                                     size_t initializer_size_threshold,
-                                                     const Graph::OffsetAlignmentInfo& align_info);
+                                                     const ModelSavingOptions& save_options);
 
   static common::Status SaveWithExternalInitializers(Model& model,
                                                      int fd,
                                                      const std::filesystem::path& file_path,
                                                      const std::filesystem::path& external_file_path,
-                                                     size_t initializer_size_threshold) {
-    Graph::OffsetAlignmentInfo default_align_info;
-    return SaveWithExternalInitializers(model, fd, file_path, external_file_path, initializer_size_threshold, default_align_info);
-  }
+                                                     const ModelSavingOptions& save_options);
 
   static common::Status Load(std::istream& model_istream, ONNX_NAMESPACE::ModelProto* p_model_proto);
 
diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h
index 7ab93d56cfe26..1e0b73bcfa8aa 100644
--- a/onnxruntime/core/providers/shared_library/provider_interfaces.h
+++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h
@@ -42,6 +42,8 @@ using ProviderType = const std::string&;
 class RandomGenerator;
 class IOnnxRuntimeOpSchemaCollection;
 
+struct ModelSavingOptions;
+
 #ifdef ENABLE_TRAINING_TORCH_INTEROP
 namespace contrib {
 class PythonOpBase;
@@ -900,7 +902,11 @@ struct ProviderHost {
   virtual void Model__operator_delete(Model* p) = 0;
   virtual Graph& Model__MainGraph(Model* p) = 0;
   virtual std::unique_ptr<ONNX_NAMESPACE::ModelProto> Model__ToProto(Model* p) = 0;
-  virtual std::unique_ptr<ONNX_NAMESPACE::ModelProto> Model__ToGraphProtoWithExternalInitializers(Model* p, const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, size_t initializer_size_threshold) = 0;
+  virtual std::unique_ptr<ONNX_NAMESPACE::ModelProto> Model__ToGraphProtoWithExternalInitializers(
+      Model* p,
+      const std::filesystem::path& external_file_name,
+      const std::filesystem::path& file_path,
+      const ModelSavingOptions&) = 0;
   virtual const ModelMetaData& Model__MetaData(const Model* p) const noexcept = 0;
   virtual Status Model__Load(const PathString& file_path, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) = 0;
 
diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
index a82ddfe64c64b..2fadf3c25549b 100644
--- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
+++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
@@ -933,6 +933,8 @@ struct NodeUnit final {
   Node::EdgeConstIterator OutputEdgesEnd() const { return g_host->NodeUnit__OutputEdgesEnd(this); }
 };
 
+struct ModelSavingOptions;
+
 struct Model final {
   static std::unique_ptr<Model> Create(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path,
                                        const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) {
@@ -944,7 +946,12 @@ struct Model final {
   Graph& MainGraph() { return g_host->Model__MainGraph(this); }
 
   std::unique_ptr<ONNX_NAMESPACE::ModelProto> ToProto() { return g_host->Model__ToProto(this); }
-  std::unique_ptr<ONNX_NAMESPACE::ModelProto> ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, size_t initializer_size_threshold) { return g_host->Model__ToGraphProtoWithExternalInitializers(this, external_file_name, file_path, initializer_size_threshold); }
+  std::unique_ptr<ONNX_NAMESPACE::ModelProto> ToGraphProtoWithExternalInitializers(
+      const std::filesystem::path& external_file_name,
+      const std::filesystem::path& file_path, const ModelSavingOptions& model_saving_options) {
+    return g_host->Model__ToGraphProtoWithExternalInitializers(this, external_file_name, file_path,
+                                                               model_saving_options);
+  }
   const ModelMetaData& MetaData() const noexcept { return g_host->Model__MetaData(this); }
 
   Model() = delete;
diff --git a/onnxruntime/core/providers/vitisai/imp/graph.cc b/onnxruntime/core/providers/vitisai/imp/graph.cc
index 191d26f3ab269..e7b39546fda6a 100644
--- a/onnxruntime/core/providers/vitisai/imp/graph.cc
+++ b/onnxruntime/core/providers/vitisai/imp/graph.cc
@@ -9,6 +9,7 @@
 #include <locale>
 #include <string>
 
+#include "core/graph/model_saving_options.h"
 #include "core/providers/shared_library/provider_api.h"
 #include "./vai_assert.h"
 
@@ -111,7 +112,9 @@ void graph_save(const Graph& graph, const std::string& filename, const std::stri
   if (initializer_size_threshold == std::numeric_limits<size_t>::max()) {
     model_proto = model->ToProto();
   } else {
-    model_proto = model->ToGraphProtoWithExternalInitializers(ToPathString(filename_dat), ToPathString(filename), initializer_size_threshold);
+    ModelSavingOptions model_saving_options{initializer_size_threshold};
+    model_proto = model->ToGraphProtoWithExternalInitializers(ToPathString(filename_dat), ToPathString(filename),
+                                                              model_saving_options);
   }
   auto& metadata = model->MetaData();
   if (!metadata.empty()) {
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index a60ee500a9898..996c5e4bdb723 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -38,6 +38,7 @@
 #include "core/framework/utils.h"
 #include "core/graph/graph_viewer.h"
 #include "core/graph/model.h"
+#include "core/graph/model_saving_options.h"
 #include "core/optimizer/graph_transformer_utils.h"
 #include "core/optimizer/graph_transformer.h"
 #include "core/optimizer/layout_transformation/layout_transformation.h"
@@ -2063,6 +2064,8 @@ common::Status InferenceSession::Initialize() {
 #endif  // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
     }
 
+    session_state_->SetSaveModeForPrepacks(saving_model, saving_ort_format);
+
     ORT_RETURN_IF_ERROR_SESSIONID_(
         session_state_->FinalizeSessionState(model_location_, kernel_registry_manager_,
                                              // need to keep the initializers if saving the optimized model
@@ -2099,13 +2102,16 @@ common::Status InferenceSession::Initialize() {
           const size_t optimized_model_external_initializers_min_size_in_bytes =
               ParseStringWithClassicLocale<size_t>(session_options_.config_options.GetConfigOrDefault(
                   kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes, "1024"));
-          Graph::OffsetAlignmentInfo align_info;
-          align_info.align_offset = true;
+          ModelSavingOptions model_saving_options{optimized_model_external_initializers_min_size_in_bytes};
+          model_saving_options.align_offset = true;
+          const auto& prepacked_for_serialization = session_state_->GetPrepackedForSerialization();
+          model_saving_options.prepacked_for_save = (prepacked_for_serialization.IsSaveModeOn())
+                                                        ? &prepacked_for_serialization
+                                                        : nullptr;
           ORT_RETURN_IF_ERROR_SESSIONID_(Model::SaveWithExternalInitializers(*model_,
                                                                              session_options_.optimized_model_filepath,
                                                                              optimized_model_external_initializers_file_name,
-                                                                             optimized_model_external_initializers_min_size_in_bytes,
-                                                                             align_info));
+                                                                             model_saving_options));
         }
       }
     }
diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
index c3832498af584..402e13c2c505b 100644
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@@ -1071,7 +1071,14 @@ struct ProviderHostImpl : ProviderHost {
   void Model__operator_delete(Model* p) override { delete p; }
   Graph& Model__MainGraph(Model* p) override { return p->MainGraph(); }
   std::unique_ptr<ONNX_NAMESPACE::ModelProto> Model__ToProto(Model* p) override { return std::make_unique<ONNX_NAMESPACE::ModelProto>(p->ToProto()); }
-  std::unique_ptr<ONNX_NAMESPACE::ModelProto> Model__ToGraphProtoWithExternalInitializers(Model* p, const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, size_t initializer_size_threshold) override { return std::make_unique<ONNX_NAMESPACE::ModelProto>(p->ToGraphProtoWithExternalInitializers(external_file_name, file_path, initializer_size_threshold)); };
+  std::unique_ptr<ONNX_NAMESPACE::ModelProto> Model__ToGraphProtoWithExternalInitializers(Model* p,
+                                                                                          const std::filesystem::path& external_file_name,
+                                                                                          const std::filesystem::path& file_path,
+                                                                                          const ModelSavingOptions& model_saving_options) override {
+    return std::make_unique<ONNX_NAMESPACE::ModelProto>(p->ToGraphProtoWithExternalInitializers(external_file_name,
+                                                                                                file_path,
+                                                                                                model_saving_options));
+  };
   const ModelMetaData& Model__MetaData(const Model* p) const noexcept override { return p->MetaData(); };
   Status Model__Load(const PathString& file_path, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) override { return Model::Load(file_path, model_proto); }
 
diff --git a/onnxruntime/test/framework/save_model_with_external_initializers.cc b/onnxruntime/test/framework/save_model_with_external_initializers.cc
index d0bc088175755..98874874d50e9 100644
--- a/onnxruntime/test/framework/save_model_with_external_initializers.cc
+++ b/onnxruntime/test/framework/save_model_with_external_initializers.cc
@@ -6,6 +6,7 @@
 #include "core/common/path_string.h"
 #include "core/framework/data_types.h"
 #include "core/graph/model.h"
+#include "core/graph/model_saving_options.h"
 #include "core/framework/tensorprotoutils.h"
 #include "test/test_environment.h"
 #include "test_utils.h"
@@ -23,15 +24,14 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx,
                                const std::filesystem::path& input_external_init_file,
                                const std::filesystem::path& output_onnx,
                                const std::filesystem::path& output_external_init_file,
-                               size_t initializer_size_threshold,
-                               const Graph::OffsetAlignmentInfo& align_info) {
+                               const ModelSavingOptions& model_saving_options) {
   auto logger = DefaultLoggingManager().CreateLogger("LoadSaveAndCompareModel");
   std::shared_ptr<Model> model;
   ORT_RETURN_IF_ERROR(Model::Load(input_onnx, model, nullptr, *logger));
   std::filesystem::remove(output_onnx);
   std::filesystem::remove(output_external_init_file);
-  ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(*model, output_onnx, output_external_init_file, initializer_size_threshold,
-                                                          align_info));
+  ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(*model, output_onnx, output_external_init_file,
+                                                          model_saving_options));
 
   std::shared_ptr<Model> model_from_external;
   ORT_RETURN_IF_ERROR(Model::Load(output_onnx.native(), model_from_external, nullptr, *logger));
@@ -67,7 +67,7 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx,
     ORT_RETURN_IF_ERROR(utils::UnpackInitializerData(*from_external_tensor_proto, model_path, from_external_tensor_proto_data));
     size_t from_external_tensor_proto_size = from_external_tensor_proto_data.size();
 
-    if (from_external_tensor_proto_size < initializer_size_threshold) {
+    if (from_external_tensor_proto_size < model_saving_options.initializer_size_threshold) {
       // 'Small' tensors should be embedded in the onnx file.
       ORT_RETURN_IF_NOT(from_external_tensor_proto->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_DEFAULT, "location mismatch");
     } else {
@@ -78,13 +78,14 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx,
     ORT_RETURN_IF_NOT(tensor_proto_size == from_external_tensor_proto_size, "size mismatch");
     ORT_RETURN_IF_NOT(memcmp(tensor_proto_data.data(), from_external_tensor_proto_data.data(), tensor_proto_size) == 0, "data mismatch");
 
-    if (align_info.align_offset) {
+    if (model_saving_options.align_offset) {
       for (const StringStringEntryProto& entry : from_external_tensor_proto->external_data()) {
         if (entry.has_key() && entry.has_value() && entry.key() == "offset") {
           size_t tensor_offset;
           std::stringstream stream(entry.value());
           stream >> tensor_offset;
-          ORT_RETURN_IF_NOT(tensor_offset % align_info.allocation_granularity == 0, "tensor offset not align");
+          ORT_RETURN_IF_NOT(tensor_offset % model_saving_options.allocation_granularity == 0,
+                            "tensor offset not align");
         }
       }
     }
@@ -97,22 +98,35 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx,
 
 // Original model does not have external initializers
 TEST(SaveWithExternalInitializers, Mnist) {
-  Graph::OffsetAlignmentInfo align_info;
-  ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/mnist.onnx"), ORT_TSTR(""), ORT_TSTR("testdata/mnist_with_external_initializers.onnx"), ORT_TSTR("mnist_external_initializers.bin"), 100, align_info));
+  ModelSavingOptions model_saving_options{100};
+  ASSERT_STATUS_OK(LoadSaveAndCompareModel(
+      ORT_TSTR("testdata/mnist.onnx"),
+      ORT_TSTR(""), ORT_TSTR("testdata/mnist_with_external_initializers.onnx"),
+      ORT_TSTR("mnist_external_initializers.bin"),
+      model_saving_options));
 }
 
 // Original model has external initializers
 TEST(SaveWithExternalInitializers, ModelWithOriginalExternalData) {
-  Graph::OffsetAlignmentInfo align_info;
-  ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), ORT_TSTR("model_with_orig_ext_data.onnx.data"), ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), ORT_TSTR("model_with_new_external_initializers.bin"), 0, align_info));
+  ModelSavingOptions model_saving_options{0};
+  ASSERT_STATUS_OK(LoadSaveAndCompareModel(
+      ORT_TSTR("testdata/model_with_orig_ext_data.onnx"),
+      ORT_TSTR("model_with_orig_ext_data.onnx.data"),
+      ORT_TSTR("testdata/model_with_new_external_initializers.onnx"),
+      ORT_TSTR("model_with_new_external_initializers.bin"),
+      model_saving_options));
 }
 
 // Original model has external initializers, align offset
 TEST(SaveWithExternalInitializers, ModelWithOriginalExternalDataAlignOffset) {
-  Graph::OffsetAlignmentInfo align_info;
-  align_info.align_offset = true;
-  align_info.align_threshold = 0;
-  ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), ORT_TSTR("model_with_orig_ext_data.onnx.data"), ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), ORT_TSTR("model_with_new_external_initializers.bin"), 0, align_info));
+  ModelSavingOptions model_saving_options{0};
+  model_saving_options.align_offset = true;
+  model_saving_options.align_threshold = 0;
+  ASSERT_STATUS_OK(LoadSaveAndCompareModel(
+      ORT_TSTR("testdata/model_with_orig_ext_data.onnx"),
+      ORT_TSTR("model_with_orig_ext_data.onnx.data"),
+      ORT_TSTR("testdata/model_with_new_external_initializers.onnx"),
+      ORT_TSTR("model_with_new_external_initializers.bin"), model_saving_options));
 }
 
 }  // namespace test
diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc
index 3e694020f796b..a7dcddfe61c00 100644
--- a/onnxruntime/test/framework/session_state_test.cc
+++ b/onnxruntime/test/framework/session_state_test.cc
@@ -15,6 +15,7 @@
 #include "core/graph/graph_utils.h"
 #include "core/graph/graph_viewer.h"
 #include "core/graph/model.h"
+#include "core/graph/model_saving_options.h"
 #include "core/graph/op.h"
 #include "core/providers/cpu/cpu_execution_provider.h"
 #include "core/session/onnxruntime_session_options_config_keys.h"
@@ -22,12 +23,73 @@
 #include "gtest/gtest.h"
 #include "test/test_environment.h"
 #include "test/util/include/default_providers.h"
+#include "test/util/include/file_util.h"
 #include "core/optimizer/layout_transformation/layout_transformation.h"
 
 using namespace ONNX_NAMESPACE;
-using namespace std;
 namespace onnxruntime {
 
+struct InspectPrepackedSubgraph {
+};
+
+// This specialization is used by SessionStateTestSharedInitalizersWithPrePacking.TestPrepackedSerialization down below
+// to be called on the main graph
+template <>
+void PrepackedForSerialization::Subgraph::TestHarness<InspectPrepackedSubgraph>(InspectPrepackedSubgraph&) const {
+  auto inspect = [](const PrepackedForSerialization::Subgraph& sub) {
+    const size_t expected_subgraphs = (sub.Parent() == nullptr) ? 2U : 0U;
+    ASSERT_EQ(sub.GetSubgraphNum(), expected_subgraphs);
+
+    const size_t expected_prepacks_for_writing = (sub.Parent() == nullptr) ? 0U : 1U;
+    ASSERT_EQ(expected_prepacks_for_writing, sub.GetNumberOfWeightsForWriting());
+
+    const size_t expected_blobs_for_writing = (sub.Parent() == nullptr) ? 0U : 1U;
+    ASSERT_EQ(expected_blobs_for_writing, sub.GetNumberOfKeyedBlobsForWriting());
+
+    if (sub.Parent() != nullptr) {
+      const auto* blob_keys = sub.GetBlobsForWeight("if_shared");
+      ASSERT_TRUE(blob_keys != nullptr);
+      ASSERT_EQ(blob_keys->size(), 1U);
+      const auto* prepacked_weights = sub.GetPrepackedWeights(*blob_keys->cbegin());
+      ASSERT_TRUE(prepacked_weights != nullptr);
+      ASSERT_EQ(prepacked_weights->buffer_sizes_.size(), 1U);
+      ASSERT_EQ(prepacked_weights->buffer_sizes_[0], sizeof(float) * 2);
+    }
+  };
+
+  inspect(*this);
+  for (const auto& subgraph : subgraph_prepacks_) {
+    inspect(*subgraph.second);
+  }
+}
+
+struct InspectLoadedSharedPrepacked {
+};
+
+template <>
+void PrepackedForSerialization::Subgraph::TestHarness<InspectLoadedSharedPrepacked>(
+    InspectLoadedSharedPrepacked&) const {
+  auto inspect = [this](const PrepackedForSerialization::Subgraph& sub) {
+    const size_t expected_subgraphs = (sub.Parent() == nullptr) ? 2U : 0U;
+    ASSERT_EQ(expected_subgraphs, sub.GetSubgraphNum());
+
+    // We are expecting to load only one shared pre-packed weight
+    // Saving is off, so we should not have any pre-packed weights for writing
+    const size_t expected_prepacks_for_writing = 0U;
+    ASSERT_EQ(expected_prepacks_for_writing, sub.GetNumberOfWeightsForWriting());
+
+    const size_t expected_blobs_for_writing = 0U;
+    ASSERT_EQ(expected_blobs_for_writing, sub.GetNumberOfKeyedBlobsForWriting());
+
+    ASSERT_EQ(0U, key_to_blobs_.size());
+  };
+
+  inspect(*this);
+  for (const auto& subgraph : subgraph_prepacks_) {
+    inspect(*subgraph.second);
+  }
+}
+
 namespace test {
 class TestOpKernel : public OpKernel {
  public:
@@ -378,7 +440,7 @@ class PrePackingTestOpKernel : public OpKernel {
     ORT_UNUSED_PARAMETER(tensor);
     ORT_UNUSED_PARAMETER(input_idx);
 
-    size_t weight_packed_len = 8;
+    constexpr const size_t weight_packed_len = sizeof(float) * 2;
     weight_packed_ = IAllocator::MakeUniquePtr<void>(alloc, weight_packed_len, true);
     float* data_weights_packed = reinterpret_cast<float*>(weight_packed_.get());
     data_weights_packed[0] = 1.2345f;
@@ -647,7 +709,8 @@ class SessionStateTestSharedInitalizersWithPrePacking : public ::testing::Test {
   }
 };
 
-// Pre-packing enabled + no shared initializers = no pre-packed weights caching
+// Pre-packing enabled + no shared initializers, however, we put all the pre-packs
+// in a session_state container for ownership.
 TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test1) {
   SessionOptions sess_options;
   sess_options.enable_mem_pattern = true;
@@ -679,10 +742,11 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test1) {
 
   const auto* kernel = reinterpret_cast<const PrePackingTestOpKernel*>(session_state_1.GetKernel(0));
 
-  // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked
+  // Assert that a pre-pack call was made. However, they sharing call is still made from a serialized container.
   ASSERT_EQ(session_state_1.GetNumberOfPrepacksCounter(), static_cast<size_t>(1));
   ASSERT_EQ(kernel->prepack_calls_count, 1);
-  ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0);
+  // In this case the sharing comes from the serialized container
+  ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1);
 
   // Second session/model
   Model model_2("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
@@ -706,10 +770,11 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test1) {
 
   kernel = reinterpret_cast<const PrePackingTestOpKernel*>(session_state_2.GetKernel(0));
 
-  // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked
+  // Assert that a pre-pack call was made. The weights are still shared from the serialized container
+  // either because they are loaded from disk or because the container takes ownership of them.
   ASSERT_EQ(session_state_2.GetNumberOfPrepacksCounter(), static_cast<size_t>(1));
   ASSERT_EQ(kernel->prepack_calls_count, 1);
-  ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0);
+  ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1);
 }
 
 // Pre-packing enabled + shared initializers + no pre-packed weights container = no pre-packed weights caching
@@ -754,10 +819,10 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test2) {
 
   const auto* kernel = reinterpret_cast<const PrePackingTestOpKernel*>(session_state_1.GetKernel(0));
 
-  // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked
+  // Assert that a pre-pack call was made, but sharing still takes place from the serialized container
   ASSERT_EQ(session_state_1.GetNumberOfPrepacksCounter(), static_cast<size_t>(1));
   ASSERT_EQ(kernel->prepack_calls_count, 1);
-  ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0);
+  ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1);
 
   // Second session/model
   Model model_2("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
@@ -781,10 +846,10 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test2) {
 
   kernel = reinterpret_cast<const PrePackingTestOpKernel*>(session_state_2.GetKernel(0));
 
-  // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked
+  // Assert that a pre-pack call was made, but sharing still takes place from the serialized container
   ASSERT_EQ(session_state_2.GetNumberOfPrepacksCounter(), static_cast<size_t>(1));
   ASSERT_EQ(kernel->prepack_calls_count, 1);
-  ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0);
+  ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1);
 }
 
 // Pre-packing enabled + shared initializers + pre-packed weights container = pre-packed weights caching enabled
@@ -999,6 +1064,215 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test4) {
   ASSERT_EQ(if_node_branches_shared_prepack_counter_2, static_cast<size_t>(2));
 }
 
+// sharing is on
+TEST_F(SessionStateTestSharedInitalizersWithPrePacking, TestPrepackedSerialization) {
+  const std::filesystem::path model_with_external_initializers =
+      "testdata/test_prepacked_serialization_optimized_model.onnx";
+
+  const std::filesystem::path external_initializers_file =
+      "test_prepacked_serialization_optimized_model.bin";
+
+  {
+    SessionOptions sess_options;
+    sess_options.enable_mem_pattern = true;
+    sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
+    sess_options.use_deterministic_compute = false;
+    sess_options.enable_mem_reuse = true;
+    sess_options.optimized_model_filepath = model_with_external_initializers;
+
+    // Enable pre-packing
+    sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0";
+    // Enable saving model with pre-packed weights
+    sess_options.config_options.configurations[kOrtSessionOptionsSavePrePackedConstantInitializers] = "1";
+
+    // Enable shared initializer
+    OrtMemoryInfo mem_info(CPU, OrtDeviceAllocator);
+    std::vector<float> float_data(1, 1);
+    auto value = std::make_unique<OrtValue>();
+    Tensor::InitOrtValue(DataTypeImpl::GetType<float>(), TensorShape(std::vector<int64_t>{1}),
+                         reinterpret_cast<void*>(float_data.data()), mem_info, *value);
+
+    ASSERT_STATUS_OK(sess_options.AddInitializer("if_shared", value.get()));
+
+    // Enable pre-packed weights container for shared initializers
+    PrepackedWeightsContainer prepacked_weights_container;
+    Model model_1("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
+                  domain_to_version, std::vector<ONNX_NAMESPACE::FunctionProto>(),
+                  DefaultLoggingManager().DefaultLogger());
+
+    CreateGraphWithSubgraph(model_1.MainGraph());
+    PlaceAllNodesToCPUEP(model_1.MainGraph());
+    SessionState session_state_1(model_1.MainGraph(),
+                                 execution_providers,
+                                 tp.get(),
+                                 nullptr, /*inter_op_thread_pool*/
+                                 dtm,
+                                 edlm,
+                                 DefaultLoggingManager().DefaultLogger(),
+                                 profiler,
+                                 sess_options,
+                                 &prepacked_weights_container);
+
+    constexpr const bool saving_model_true = true;
+    constexpr const bool saving_ort_model_false = false;
+    session_state_1.SetSaveModeForPrepacks(saving_model_true, saving_ort_model_false);
+
+    ASSERT_STATUS_OK(session_state_1.FinalizeSessionState(std::basic_string<PATH_CHAR_TYPE>(),
+                                                          kernel_registry_manager,
+                                                          !saving_model_true));
+
+    const auto& prepacked_for_serialization = session_state_1.GetPrepackedForSerialization();
+    ASSERT_TRUE(prepacked_for_serialization.IsSaveModeOn());
+    ASSERT_EQ(prepacked_for_serialization.GetNumberOfKeyedBlobs(), 1U);
+    // In this case we have two references to this blob from two subgraphs.
+    // It would save it on disk twice, but we will de-dupe it on load.
+    const auto& main_graph = prepacked_for_serialization.MainGraph();
+    InspectPrepackedSubgraph inspector;
+    main_graph.TestHarness<InspectPrepackedSubgraph>(inspector);
+
+    ModelSavingOptions model_saving_options{4};
+    model_saving_options.align_offset = true;
+    model_saving_options.prepacked_for_save = &session_state_1.GetPrepackedForSerialization();
+
+    ASSERT_STATUS_OK(Model::SaveWithExternalInitializers(model_1, model_with_external_initializers,
+                                                         external_initializers_file,
+                                                         model_saving_options));
+  }
+  ScopedFileDeleter test_model_deleter(model_with_external_initializers);
+  ScopedFileDeleter binary_file_deleter(external_initializers_file);
+
+  // Now let's load the model along with the initializers
+  {
+    SessionOptions sess_options;
+    sess_options.enable_mem_pattern = true;
+    sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
+    sess_options.use_deterministic_compute = false;
+    sess_options.enable_mem_reuse = true;
+
+    // Enable pre-packing
+    sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0";
+
+    // We are expecting this weight to be loaded from disk along
+    // with its pre-packed version
+    // Enable shared initializer
+    OrtMemoryInfo mem_info(CPU, OrtDeviceAllocator);
+    std::vector<float> float_data(1, 1);
+    auto value = std::make_unique<OrtValue>();
+    Tensor::InitOrtValue(DataTypeImpl::GetType<float>(), TensorShape(std::vector<int64_t>{1}),
+                         reinterpret_cast<void*>(float_data.data()), mem_info, *value);
+
+    ASSERT_STATUS_OK(sess_options.AddInitializer("if_shared", value.get()));
+
+    // Enable pre-packed weights container for shared initializers
+    PrepackedWeightsContainer prepacked_weights_container;
+
+    std::shared_ptr<Model> model;
+    ASSERT_STATUS_OK(Model::Load(model_with_external_initializers, model, nullptr,
+                                 DefaultLoggingManager().DefaultLogger()));
+
+    PlaceAllNodesToCPUEP(model->MainGraph());
+    SessionState session_state(model->MainGraph(),
+                               execution_providers,
+                               tp.get(),
+                               nullptr, /*inter_op_thread_pool*/
+                               dtm,
+                               edlm,
+                               DefaultLoggingManager().DefaultLogger(),
+                               profiler,
+                               sess_options,
+                               &prepacked_weights_container);
+
+    ASSERT_STATUS_OK(session_state.FinalizeSessionState(std::basic_string<PATH_CHAR_TYPE>(),
+                                                        kernel_registry_manager,
+                                                        false));
+
+    const auto& prepacked_for_serialization = session_state.GetPrepackedForSerialization();
+    ASSERT_FALSE(prepacked_for_serialization.IsSaveModeOn());
+    // Everything comes from the shared container
+    ASSERT_EQ(prepacked_for_serialization.GetNumberOfKeyedBlobs(), 0U);
+
+    InspectLoadedSharedPrepacked inspector;
+    const auto& main_graph = prepacked_for_serialization.MainGraph();
+    main_graph.TestHarness<InspectLoadedSharedPrepacked>(inspector);
+  }
+
+  // Load again, this time sharing is enabled, but no shared initializer in the map
+  {
+    SessionOptions sess_options;
+    sess_options.enable_mem_pattern = true;
+    sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
+    sess_options.use_deterministic_compute = false;
+    sess_options.enable_mem_reuse = true;
+
+    // Enable pre-packing
+    sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0";
+
+    // Enable pre-packed weights container for shared initializers
+    PrepackedWeightsContainer prepacked_weights_container;
+
+    std::shared_ptr<Model> model;
+    ASSERT_STATUS_OK(Model::Load(model_with_external_initializers, model, nullptr,
+                                 DefaultLoggingManager().DefaultLogger()));
+
+    PlaceAllNodesToCPUEP(model->MainGraph());
+    SessionState session_state(model->MainGraph(),
+                               execution_providers,
+                               tp.get(),
+                               nullptr, /*inter_op_thread_pool*/
+                               dtm,
+                               edlm,
+                               DefaultLoggingManager().DefaultLogger(),
+                               profiler,
+                               sess_options,
+                               &prepacked_weights_container);
+
+    ASSERT_STATUS_OK(session_state.FinalizeSessionState(model_with_external_initializers,
+                                                        kernel_registry_manager,
+                                                        false));
+
+    const auto& prepacked_for_serialization = session_state.GetPrepackedForSerialization();
+    ASSERT_FALSE(prepacked_for_serialization.IsSaveModeOn());
+    // Everything comes from the shared container
+    ASSERT_EQ(prepacked_for_serialization.GetNumberOfKeyedBlobs(), 1U);
+  }
+  // Load again, sharing is disabled
+  {
+    SessionOptions sess_options;
+    sess_options.enable_mem_pattern = true;
+    sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
+    sess_options.use_deterministic_compute = false;
+    sess_options.enable_mem_reuse = true;
+
+    // Enable pre-packing
+    sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0";
+
+    std::shared_ptr<Model> model;
+    ASSERT_STATUS_OK(Model::Load(model_with_external_initializers, model, nullptr,
+                                 DefaultLoggingManager().DefaultLogger()));
+
+    PlaceAllNodesToCPUEP(model->MainGraph());
+    SessionState session_state(model->MainGraph(),
+                               execution_providers,
+                               tp.get(),
+                               nullptr, /*inter_op_thread_pool*/
+                               dtm,
+                               edlm,
+                               DefaultLoggingManager().DefaultLogger(),
+                               profiler,
+                               sess_options,
+                               nullptr);
+
+    ASSERT_STATUS_OK(session_state.FinalizeSessionState(model_with_external_initializers,
+                                                        kernel_registry_manager,
+                                                        false));
+
+    const auto& prepacked_for_serialization = session_state.GetPrepackedForSerialization();
+    ASSERT_FALSE(prepacked_for_serialization.IsSaveModeOn());
+    // Everything comes from the shared container
+    ASSERT_EQ(prepacked_for_serialization.GetNumberOfKeyedBlobs(), 1U);
+  }
+}
+
 INSTANTIATE_TEST_SUITE_P(SessionStateTests,
                          SessionStatePrepackingTest,
                          testing::Values(PrepackingTestParam{false, false},
diff --git a/onnxruntime/test/framework/tensorutils_test.cc b/onnxruntime/test/framework/tensorutils_test.cc
index 6821f582ce2de..6372756f5b1f8 100644
--- a/onnxruntime/test/framework/tensorutils_test.cc
+++ b/onnxruntime/test/framework/tensorutils_test.cc
@@ -1,6 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include "core/common/inlined_containers.h"
+#include "core/framework/prepacked_weights.h"
+#include "core/framework/prepacked_weights_container.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/graph/onnx_protobuf.h"
 #include "test/util/include/asserts.h"
@@ -19,6 +22,74 @@ using namespace ONNX_NAMESPACE;
 namespace onnxruntime {
 namespace test {
 
+// Test ExternalData functionality
+TEST(TensorProtoUtilsTest, SetExternalDataInformation) {
+  ONNX_NAMESPACE::TensorProto tensor_proto;
+  const std::filesystem::path kExternalDataPath("test.bin");
+  const int64_t init_offset = 100;
+  const size_t init_length = 200;
+
+  ExternalDataInfo::SetExternalLocationToProto(kExternalDataPath, init_offset, init_length, tensor_proto);
+
+  ASSERT_EQ(tensor_proto.data_location(), ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL);
+  ASSERT_EQ(tensor_proto.external_data_size(), 3);
+  ASSERT_EQ(tensor_proto.external_data(0).key(), "location");
+  ASSERT_EQ(tensor_proto.external_data(0).value(), ToUTF8String(kExternalDataPath.native()));
+  ASSERT_EQ(tensor_proto.external_data(1).key(), "offset");
+  ASSERT_EQ(tensor_proto.external_data(1).value(), std::to_string(init_offset));
+  ASSERT_EQ(tensor_proto.external_data(2).key(), "length");
+  ASSERT_EQ(tensor_proto.external_data(2).value(), std::to_string(init_length));
+
+  PrepackedForSerialization prepacked_for_serialization;
+  prepacked_for_serialization.SetSaveMode(true);
+  PrePackedWeights prepacked_weights;
+  constexpr size_t buffer_size = 100;
+  const std::string init_name = "test_initializer";
+  const std::string blob_key = "test_key";
+
+  std::array<float, 2> kData = {1.2345f, 2.4690f};
+
+  prepacked_weights.buffers_.push_back(BufferUniquePtr(kData.data(), BufferDeleter(nullptr)));
+  prepacked_weights.buffer_sizes_.push_back(buffer_size);
+  // Write a second entry like this
+  prepacked_weights.buffers_.push_back(BufferUniquePtr(kData.data(), BufferDeleter(nullptr)));
+  prepacked_weights.buffer_sizes_.push_back(buffer_size);
+
+  prepacked_for_serialization.MainGraph().WritePacked(init_name, blob_key, std::move(prepacked_weights));
+
+  const int64_t starting_offset = 300;
+  int64_t external_offset = starting_offset;
+  std::stringstream ss;
+  const auto* blobs_for_weight = prepacked_for_serialization.MainGraph().GetBlobsForWeight(init_name);
+  ASSERT_TRUE(blobs_for_weight != nullptr);
+  InlinedHashSet<std::string> blob_keys{blobs_for_weight->begin(), blobs_for_weight->end()};
+  ASSERT_TRUE(ExternalDataInfo::WritePrepackedToFileAndAddToProto(prepacked_for_serialization,
+                                                                  blob_keys,
+                                                                  true, 0, ss, external_offset,
+                                                                  tensor_proto));
+
+  auto external_data_info = std::make_unique<ExternalDataInfo>();
+  ASSERT_STATUS_OK(ExternalDataInfo::Create(tensor_proto.external_data(), external_data_info));
+
+  // This should have prepacked_data entry with two blobs for a single key.
+  ASSERT_TRUE(external_data_info->HasPrepackedInfo());
+  auto prepacked_infos = external_data_info->TakePrepackedInfos();
+  ASSERT_EQ(prepacked_infos.size(), 1U);
+  ASSERT_TRUE(prepacked_infos.count(blob_key) > 0);
+
+  int64_t final_offset = starting_offset;
+  for (const auto& blob_info : prepacked_infos[blob_key]) {
+    int64_t offset = std::get<0>(blob_info);
+    ASSERT_GT(offset, final_offset);
+    size_t length = std::get<1>(blob_info);
+    std::string checksum = std::get<2>(blob_info);  // currently "0"
+    final_offset = offset + length;
+    ASSERT_EQ(length, buffer_size);
+    ASSERT_EQ(checksum, "0");
+  }
+  ASSERT_EQ(final_offset, external_offset);
+}
+
 // T must be float for double, and it must match with the 'type' argument
 template <typename T>
 void TestUnpackFloatTensor(TensorProto_DataType type, const std::filesystem::path& model_path) {
diff --git a/orttraining/orttraining/core/session/training_session.cc b/orttraining/orttraining/core/session/training_session.cc
index f1545e96481fa..b03f1b1eadb3b 100644
--- a/orttraining/orttraining/core/session/training_session.cc
+++ b/orttraining/orttraining/core/session/training_session.cc
@@ -5,6 +5,7 @@
 
 #include "core/framework/data_transfer_utils.h"
 #include "core/graph/model.h"
+#include "core/graph/model_saving_options.h"
 #include "core/session/IOBinding.h"
 #include "core/optimizer/rule_based_graph_transformer.h"
 #include "core/providers/cpu/controlflow/utils.h"
@@ -1003,7 +1004,8 @@ Status TrainingSession::SaveWithExternalInitializers(const PathString& model_uri
   std::remove(ToUTF8String(model_uri).c_str());
   std::remove(external_file_name.c_str());
 
-  return Model::SaveWithExternalInitializers(*model_, model_uri, external_file_name, initializer_size_threshold);
+  ModelSavingOptions model_saving_options{initializer_size_threshold};
+  return Model::SaveWithExternalInitializers(*model_, model_uri, external_file_name, model_saving_options);
 }
 
 Status TrainingSession::Save(const PathString& model_uri, TrainingSession::SaveOption opt) {
diff --git a/orttraining/orttraining/training_api/module.cc b/orttraining/orttraining/training_api/module.cc
index 939e1de334e52..8f2d0f6531500 100644
--- a/orttraining/orttraining/training_api/module.cc
+++ b/orttraining/orttraining/training_api/module.cc
@@ -689,8 +689,10 @@ Status Module::ExportModelForInferencing(const std::string& inference_model_path
     std::string external_data_name =
         ORT_TSTR_CONVERT_TO_PRINTABLE_STRING(ExternalCheckpointDataPath(ToPathString(inference_model_path)));
     PathString inference_model_pathstring = ToPathString(inference_model_path);
+    ModelSavingOptions model_saving_options{64};
     ORT_THROW_IF_ERROR(
-        Model::SaveWithExternalInitializers(*inference_model, inference_model_pathstring, external_data_name, 64));
+        Model::SaveWithExternalInitializers(*inference_model, inference_model_pathstring, external_data_name,
+                                            model_saving_options));
   } else {
     ORT_THROW_IF_ERROR(Model::Save(*inference_model, ToPathString(inference_model_path)));
   }