Skip to content

Commit

Permalink
Implement pre-packed blobs serialization on disk
Browse files Browse the repository at this point in the history
 and pre-packed blobs sharing when weights sharing is not enabled.
 Memory map pre-packed blobs.
 Recurse into subgraphs in ToGraphProtoWithExternalInitializers
 to make sure all big weights are serialized along with their
 pre-packs that is to be shared between the subgraphs.
  • Loading branch information
yuslepukhin committed Dec 10, 2024
1 parent bf4d3e1 commit 938bccf
Show file tree
Hide file tree
Showing 28 changed files with 1,353 additions and 246 deletions.
4 changes: 3 additions & 1 deletion include/onnxruntime/core/framework/buffer_deleter.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

#include "core/framework/allocator.h"

#include <functional>

namespace onnxruntime {

// TODO: Do we need this class or is IAllocator::MakeUniquePtr sufficient/better
Expand All @@ -31,6 +33,6 @@ class BufferDeleter {
AllocatorPtr alloc_{nullptr};
};

using BufferUniquePtr = std::unique_ptr<void, BufferDeleter>;
using BufferUniquePtr = std::unique_ptr<void, std::function<void(void*)>>;
using BufferNakedPtr = void*;
} // namespace onnxruntime
58 changes: 25 additions & 33 deletions include/onnxruntime/core/graph/graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@
#include "core/common/common.h"
#include "core/common/path_string.h"
#include "core/common/const_pointer_container.h"
#include "core/common/inlined_containers_fwd.h"
#if !defined(ORT_MINIMAL_BUILD)
#include "core/common/inlined_containers.h"
#endif
#include "core/common/inlined_containers_fwd.h"
#include "core/common/span_utils.h"
#include "core/common/status.h"
#include "core/common/logging/logging.h"
Expand All @@ -41,6 +41,7 @@ namespace onnxruntime {
class Graph;
struct IndexedSubGraph;
class Model;
struct ModelSavingOptions;
class OpSignature;

#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
Expand Down Expand Up @@ -1153,29 +1154,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
const ONNX_NAMESPACE::GraphProto& ToGraphProto();
ONNX_NAMESPACE::GraphProto ToGraphProto() const;

// Options to align external initializer offsets.
// For models running on CPU, ORT will try to use mmap to load external initializers.
// To use mmap, external initializers need to be offset aligned.
// ORT saves external initializers into a single data file; each initializer is accessed with
// an offset (start position of the initializer) and a length (byte length of the initializer) within the data file.
// To use mmap, each offset needs to be aligned, which means the offset needs to be divisible by
// the allocation granularity (64KB for Windows and 4KB for other OSes).
// With align_offset set to true, ORT will align the offset of large initializers when
// saving an ONNX model with an external data file.
struct OffsetAlignmentInfo {
  // When true, offsets are page aligned and allocation-granularity aligned so the
  // data can be mmap-ed. Alignment is achieved by zero-padding the preceding tensor
  // data; tensor lengths are unchanged.
  bool align_offset{false};

  // Size threshold (in bytes) for alignment. Only tensors whose data size is
  // greater than align_threshold are force aligned; a low value wastes file
  // space on small initializers. Defaults to 1MB.
  int64_t align_threshold{1048576};

  // Allocation granularity required for mmap() support.
  // Typically 64KB on Windows and 4KB on other OSes. Defaults to 64KB.
  int64_t allocation_granularity{65536};
};

/** Gets the GraphProto representation of this Graph
@param external_file_path File path of the binary file to use for initializers.
@param model_file_path path of the model file.
Expand All @@ -1186,15 +1164,7 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
*/
ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
const std::filesystem::path& model_file_path,
size_t initializer_size_threshold,
const OffsetAlignmentInfo& align_info) const;

ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
const std::filesystem::path& model_file_path,
size_t initializer_size_threshold) const {
OffsetAlignmentInfo default_options;
return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options);
}
const ModelSavingOptions& model_saving_options) const;

/** Gets the ISchemaRegistry instances being used with this Graph. */
IOnnxRuntimeOpSchemaCollectionPtr GetSchemaRegistry() const;
Expand Down Expand Up @@ -1519,6 +1489,28 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
Status AddConstantProtoAsInitializer(const ONNX_NAMESPACE::NodeProto& constant_node_proto,
std::optional<std::string_view> new_name);

/// <summary>
/// A map that is used to keep track of pre-packed blobs to be serialized.
/// The implementation adds pre-packed external data references to the TensorProto
/// that contains the initializer data. However, it may be an outer-scope initializer.
/// Thus we need to keep track of the pre-packed blobs that are not serialized in this
/// graph, so the parent can make sure they are serialized.
///
/// The map below has the shape <weight_name, set of blob_key_name>. It contains
/// the entries that are not serialized in this graph; the parent must check them.
/// </summary>
using WeightToPrePacksMap = NodeHashMap<std::string, InlinedHashSet<std::string>>;

Status ToGraphProtoWithExternalInitiallizersImpl(
const std::filesystem::path& model_path,
const std::filesystem::path& external_file_path,
const std::filesystem::path& modified_external_file_path,
const ModelSavingOptions& model_saving_options,
WeightToPrePacksMap& unprocessed_prepacks,
ONNX_NAMESPACE::GraphProto& graph_proto,
std::ostream& external_stream,
int64_t& external_offset) const;

#endif

Version IrVersion() const noexcept {
Expand Down
44 changes: 44 additions & 0 deletions include/onnxruntime/core/graph/model_saving_options.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

namespace onnxruntime {

class PrepackedForSerialization;

// These options affect how the model initializers are saved.
// This includes options to align external initializer offsets.
// For models running on CPU, ORT will try to use mmap to load external
// initializers. To use mmap, external initializers need to be offset aligned.
// ORT saves external initializers into a single data file; each initializer is
// accessed with an offset (start position of the initializer) and a length (byte
// length of the initializer) within the data file. To use mmap, each offset needs
// to be aligned, which means the offset needs to be divisible by the allocation
// granularity (64KB for Windows and 4KB for other OSes). With align_offset set to
// true, ORT will align the offset of large initializers when saving an ONNX model
// with an external data file.
struct ModelSavingOptions {
  // size_threshold becomes initializer_size_threshold; every other option
  // starts at its default and can be adjusted on the instance afterwards.
  explicit ModelSavingOptions(size_t size_threshold)
      : initializer_size_threshold{size_threshold} {}

  // Minimal initializer size in bytes for the initializer to be externalized on disk.
  size_t initializer_size_threshold;

  // When true, offsets are page aligned and allocation-granularity aligned for
  // mmap support. Alignment is achieved by zero-padding the preceding tensor
  // data; tensor lengths are unchanged.
  bool align_offset{false};

  // Size threshold (in bytes) for alignment. Only tensors whose data size is
  // greater than align_threshold are force aligned; a low value wastes file
  // space on small initializers. Defaults to 1MB.
  int64_t align_threshold{1048576};

  // Allocation granularity required for mmap() support.
  // Typically 64KB on Windows and 4KB on other OSes. Defaults to 64KB.
  int64_t allocation_granularity{65536};

  // Optional, non-owning pointer to a container of pre-packed initializers that
  // should be embedded into the external initializers so they can also be
  // loaded from disk. nullptr (the default) means none were supplied.
  const PrepackedForSerialization* prepacked_for_save{nullptr};
};

} // namespace onnxruntime
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,15 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
"session.optimized_model_external_initializers_min_size_in_bytes";

// Use this config to save pre-packed constant initializers to an external data file.
// This makes it possible to minimize the ONNX model file size and to memory-map pre-packed
// initializers on model load.
// - "0": Default. Do not save pre-packed initializers to a data file.
// - "1": Save pre-packed constant initializers to an external data file.
// Sample usage: sess_options.add_session_config_entry(kOrtSessionOptionsSavePrePackedConstantInitializers, "1")
static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers =
"session.save_external_prepacked_constant_initializers";

// Enable EP context feature to dump the partitioned graph which includes the EP context into Onnx file.
// The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead.
// "0": disable. (default)
Expand Down
8 changes: 6 additions & 2 deletions onnxruntime/core/framework/prepacked_weights.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <vector>

#include "core/common/basic_types.h"
#include "core/common/inlined_containers_fwd.h"
#include "core/framework/buffer_deleter.h"
#include "core/framework/tensor_shape.h"

Expand All @@ -16,11 +17,14 @@ struct PrePackedWeights final {
// Hence we hold them in a container. It is up to the developer implementing each PrePack()
// method to define what gets stored in which position of the container.

std::vector<IAllocatorUniquePtr<void>> buffers_; // cache pre-packed buffers associated with the kernel
std::vector<size_t> buffer_sizes_; // cache sizes of pre-packed buffers (in bytes)
InlinedVector<BufferUniquePtr> buffers_; // cache pre-packed buffers associated with the kernel
InlinedVector<size_t> buffer_sizes_; // cache sizes of pre-packed buffers (in bytes)

// Produces a hash of the buffers stored in the given instance of this class
HashValue GetHash() const;

// The function creates a copy with non-owning BufferUniquePtrs.
PrePackedWeights CreateReferringCopy() const;
};

} // namespace onnxruntime
95 changes: 95 additions & 0 deletions onnxruntime/core/framework/prepacked_weights_container.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,21 @@

#include "core/framework/prepacked_weights_container.h"
#include "core/framework/allocator_utils.h"
#include "core/graph/graph.h"

namespace onnxruntime {

// Produces a PrePackedWeights whose buffers point at the same memory as this
// instance's buffers without taking ownership of it. Buffer sizes are copied as is.
PrePackedWeights PrePackedWeights::CreateReferringCopy() const {
  PrePackedWeights referring;
  referring.buffer_sizes_ = buffer_sizes_;

  for (const BufferUniquePtr& owned_buffer : buffers_) {
    // The deleter is built from a null allocator because the data is not owned
    // by the referring copy.
    referring.buffers_.emplace_back(owned_buffer.get(), BufferDeleter(nullptr));
  }

  return referring;
}

AllocatorPtr PrepackedWeightsContainer::GetOrCreateAllocator(const std::string& device_name) {
auto iter = allocators_.find(device_name);

Expand Down Expand Up @@ -49,4 +61,87 @@ size_t PrepackedWeightsContainer::GetNumberOfElements() const {
return prepacked_weights_map_.size();
}

// Constructs the container together with its main-graph entry, which is passed
// key_to_blobs_. main_graph_ is the entry that FindOrCreatePrepackedGraph() and
// FindPrepackedGraph() return for a graph that has no parent.
// NOTE(review): the nullptr and false arguments presumably mean "no parent" and
// "save mode off" -- confirm against the Subgraph constructor declaration.
// NOTE(review): key_to_blobs_ is handed to main_graph_ during member initialization;
// confirm it is declared before main_graph_ or is only stored by reference.
PrepackedForSerialization::PrepackedForSerialization()
: main_graph_(nullptr, key_to_blobs_, false) {
}

// Destructor is defaulted out of line, in this translation unit.
PrepackedForSerialization::~PrepackedForSerialization() = default;

// Registers a pre-packed blob that was mapped from disk under the given key.
void PrepackedForSerialization::Subgraph::InsertFromDisk(const std::string& key, PrePackedWeights&& packed_weight) {
  // The same weight can be pre-packed by the same kernel, with the same result, both in
  // subgraphs and further up the tree, so the same key may be mapped from disk more than
  // once. Going through the map keeps such duplicates from piling up.
  key_to_blobs_.emplace(key, std::move(packed_weight));
}

// Stores packed_weight under key, replacing any blob already stored for that key.
// When save mode is on, key is also recorded under weight_name for writing.
void PrepackedForSerialization::Subgraph::WritePacked(const std::string& weight_name, const std::string& key,
                                                      PrePackedWeights&& packed_weight) {
  if (auto existing = key_to_blobs_.find(key); existing != key_to_blobs_.end()) {
    // The key is already known; overwrite the stored blob.
    existing->second = std::move(packed_weight);
  } else {
    // First time this key is seen.
    key_to_blobs_.emplace(key, std::move(packed_weight));
  }

  // Whether or not the key existed, this subgraph may not have referenced it yet.
  if (save_mode_on_) {
    sorted_by_weight_for_writing_[weight_name].insert(key);
  }
}

// Looks up the blob stored under key.
// Returns a pointer to the stored entry, or nullptr when the key is unknown.
const PrePackedWeights* PrepackedForSerialization::Subgraph::GetPrepackedWeights(const std::string& key) const {
  const auto found = key_to_blobs_.find(key);
  return found != key_to_blobs_.end() ? &found->second : nullptr;
}

std::optional<PrePackedWeights> PrepackedForSerialization::Subgraph::ReplaceWithReferenceIfSaving(
const std::string& weight_name,
const std::string& key,
const PrePackedWeights& refer_if_absent) {
auto it = key_to_blobs_.find(key);
if (it == key_to_blobs_.end()) {
if (save_mode_on_) {
key_to_blobs_.emplace(key, refer_if_absent.CreateReferringCopy());
sorted_by_weight_for_writing_[weight_name].insert(key);
}
return std::nullopt;
}

PrePackedWeights result = std::move(it->second);
if (save_mode_on_) {
it->second = result.CreateReferringCopy();
auto& list = sorted_by_weight_for_writing_[weight_name];
list.insert(key);
} else {
key_to_blobs_.erase(it);
}
return result;
}

// Returns the Subgraph entry that corresponds to graph.
// A graph without a parent maps to main_graph_. For any other graph, the parent's
// entry is resolved first (recursively) and then asked to get or create the child entry.
PrepackedForSerialization::Subgraph& PrepackedForSerialization::FindOrCreatePrepackedGraph(const Graph& graph) {
  const auto* parent_graph = graph.ParentGraph();
  if (parent_graph == nullptr) {
    return main_graph_;
  }
  return FindOrCreatePrepackedGraph(*parent_graph).GetOrCreateSubgraph(graph);
}

// Read-only lookup of the Subgraph entry that corresponds to graph.
// A graph without a parent maps to main_graph_. Otherwise the parent's entry is
// looked up first (recursively); if it is missing, nullptr is returned, else the
// result of asking the parent's entry for this graph's subgraph is returned.
const PrepackedForSerialization::Subgraph* PrepackedForSerialization::FindPrepackedGraph(const Graph& graph) const {
  const auto* parent_graph = graph.ParentGraph();
  if (parent_graph == nullptr) {
    return &main_graph_;
  }

  const auto* parent_entry = FindPrepackedGraph(*parent_graph);
  if (parent_entry == nullptr) {
    return nullptr;
  }
  return parent_entry->GetSubgraph(graph);
}

} // namespace onnxruntime
Loading

0 comments on commit 938bccf

Please sign in to comment.