Skip to content

Commit

Permalink
Optimize the container
Browse files Browse the repository at this point in the history
Implement deserialization path
Start graph saving
  • Loading branch information
yuslepukhin committed Dec 4, 2024
1 parent 7b0fa40 commit df8f630
Show file tree
Hide file tree
Showing 26 changed files with 814 additions and 197 deletions.
4 changes: 3 additions & 1 deletion include/onnxruntime/core/framework/buffer_deleter.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

#include "core/framework/allocator.h"

#include <functional>

namespace onnxruntime {

// TODO: Do we need this class or is IAllocator::MakeUniquePtr sufficient/better
Expand All @@ -31,6 +33,6 @@ class BufferDeleter {
AllocatorPtr alloc_{nullptr};
};

using BufferUniquePtr = std::unique_ptr<void, BufferDeleter>;
using BufferUniquePtr = std::unique_ptr<void, std::function<void(void*)>>;
using BufferNakedPtr = void*;
} // namespace onnxruntime
41 changes: 9 additions & 32 deletions include/onnxruntime/core/graph/graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ namespace onnxruntime {
class Graph;
struct IndexedSubGraph;
class Model;
struct ModelSavingOptions;
class OpSignature;

#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
Expand Down Expand Up @@ -1153,29 +1154,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
const ONNX_NAMESPACE::GraphProto& ToGraphProto();
ONNX_NAMESPACE::GraphProto ToGraphProto() const;

// Options for aligning the offsets of external initializers.
// For models running on CPU, ORT will try to use mmap to load external initializers.
// To use mmap, the offset of each external initializer needs to be aligned.
// ORT saves external initializers into a single data file; each initializer is accessed with
// an offset (start position of the initializer) and a length (byte length of the initializer)
// within that data file.
// An offset is aligned when it is divisible by the allocation granularity
// (64KB for Windows and 4KB for other OSes).
// With align_offset set to true, ORT will align the offsets of large initializers when
// saving an ONNX model with an external data file.
struct OffsetAlignmentInfo {
  // When true, offsets are page aligned and allocation-granularity aligned so the
  // data can be memory mapped. Alignment is obtained by zero-padding after the
  // previous tensor's data; tensor lengths are unchanged.
  bool align_offset{false};
  // Size threshold (in bytes) for forced alignment: only tensor data larger than
  // this value is aligned. A low threshold wastes file space on small initializers.
  // Defaults to 1MB.
  int64_t align_threshold{1024 * 1024};
  // Allocation granularity used for mmap() support.
  // Typically 64KB on Windows and 4KB on other OSes. Defaults to 64KB.
  int64_t allocation_granularity{64 * 1024};
};

/** Gets the GraphProto representation of this Graph
@param external_file_path File path of the binary file to use for initializers.
@param model_file_path path of the model file.
Expand All @@ -1186,15 +1164,7 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
*/
ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
const std::filesystem::path& model_file_path,
size_t initializer_size_threshold,
const OffsetAlignmentInfo& align_info) const;

ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
const std::filesystem::path& model_file_path,
size_t initializer_size_threshold) const {
OffsetAlignmentInfo default_options;
return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options);
}
const ModelSavingOptions& model_saving_options) const;

/** Gets the ISchemaRegistry instances being used with this Graph. */
IOnnxRuntimeOpSchemaCollectionPtr GetSchemaRegistry() const;
Expand Down Expand Up @@ -1519,6 +1489,13 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
Status AddConstantProtoAsInitializer(const ONNX_NAMESPACE::NodeProto& constant_node_proto,
std::optional<std::string_view> new_name);

// Helper declaration; the definition is not part of this header.
// Judging by the name it backs ToGraphProtoWithExternalInitializers() — confirm against the definition.
//
// NOTE(review): the two path parameters are ordered (model_path, external_file_path), which is the
// reverse of the public ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, ...).
// Both are std::filesystem::path, so swapped arguments compile silently — double-check call sites.
// NOTE(review): graph_proto, external_stream and external_offset are non-const references, i.e. the
// callee may modify them; presumably they carry state across nested graphs — confirm.
// NOTE(review): "Initiallizers" in the name is misspelled (double 'l'); renaming requires updating
// the definition and callers that are outside this view.
ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitiallizersImpl(const std::filesystem::path& model_path,
const std::filesystem::path& external_file_path,
const ModelSavingOptions& model_saving_options,
ONNX_NAMESPACE::GraphProto& graph_proto,
std::ostream& external_stream,
int64_t& external_offset) const;

#endif

Version IrVersion() const noexcept {
Expand Down
44 changes: 44 additions & 0 deletions include/onnxruntime/core/graph/model_saving_options.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

namespace onnxruntime {

class PrepackedForSerialization;

// Options that affect how model initializers are saved, including the
// alignment of external initializer offsets.
//
// For models running on CPU, ORT will try to use mmap to load external
// initializers. ORT saves external initializers into a single data file; each
// initializer is addressed by an offset (its start position) and a length (its
// size in bytes) within that file. To be usable with mmap, an offset needs to
// be divisible by the allocation granularity (64KB for Windows and 4KB for
// other OSes). With align_offset set to true, ORT aligns the offsets of large
// initializers when saving an ONNX model with an external data file.
struct ModelSavingOptions {
  explicit ModelSavingOptions(size_t size_threshold)
      : initializer_size_threshold{size_threshold} {}

  // Minimal initializer size in bytes to be externalized on disk.
  size_t initializer_size_threshold;
  // When true, offsets are page aligned and allocation-granularity aligned for
  // mmap support. Alignment is obtained by zero-padding after the previous
  // tensor's data; tensor lengths are unchanged.
  bool align_offset{false};
  // Size threshold (in bytes) for forced alignment: only tensor data larger
  // than align_threshold is aligned. A low threshold wastes file space on
  // small initializers. Defaults to 1MB.
  int64_t align_threshold{1024 * 1024};
  // Allocation granularity used for mmap() support.
  // Typically 64KB on Windows and 4KB on other OSes. Defaults to 64KB.
  int64_t allocation_granularity{64 * 1024};
  // Optional pointer to a container of pre-packed initializers to be embedded
  // into the external initializers, so they can also be loaded from disk.
  // Null (the default) means no container is supplied.
  const PrepackedForSerialization* prepacked_for_save{nullptr};
};

}  // namespace onnxruntime
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,15 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
"session.optimized_model_external_initializers_min_size_in_bytes";

// Use this config to save pre-packed constant initializers to an external data file.
// This allows minimizing the ONNX model file size and memory mapping the pre-packed
// initializers on model load.
// - "0": Do not save pre-packed initializers to a data file. (default)
// - "1": Save pre-packed constant initializers to an external data file.
// Sample usage: sess_options.add_session_config_entry(kOrtSessionOptionsSavePrePackedConstantInitializers, "1")
static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers =
"session.save_external_prepacked_constant_initializers";

// Enable EP context feature to dump the partitioned graph which includes the EP context into Onnx file.
// The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead.
// "0": disable. (default)
Expand Down
5 changes: 3 additions & 2 deletions onnxruntime/core/framework/prepacked_weights.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <vector>

#include "core/common/basic_types.h"
#include "core/common/inlined_containers_fwd.h"
#include "core/framework/buffer_deleter.h"
#include "core/framework/tensor_shape.h"

Expand All @@ -16,8 +17,8 @@ struct PrePackedWeights final {
// Hence we hold them in container. It is upto the developer implementing each PrePack()
// method to define what gets stored in which position of the container.

std::vector<IAllocatorUniquePtr<void>> buffers_; // cache pre-packed buffers associated with the kernel
std::vector<size_t> buffer_sizes_; // cache sizes of pre-packed buffers (in bytes)
InlinedVector<BufferUniquePtr> buffers_; // cache pre-packed buffers associated with the kernel
InlinedVector<size_t> buffer_sizes_; // cache sizes of pre-packed buffers (in bytes)

// Produces a hash of the buffers stored in the given instance of this class
HashValue GetHash() const;
Expand Down
55 changes: 55 additions & 0 deletions onnxruntime/core/framework/prepacked_weights_container.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "core/framework/prepacked_weights_container.h"
#include "core/framework/allocator_utils.h"
#include "core/graph/graph.h"

namespace onnxruntime {

Expand Down Expand Up @@ -49,4 +50,58 @@ size_t PrepackedWeightsContainer::GetNumberOfElements() const {
return prepacked_weights_map_.size();
}

PrepackedForSerialization::PrepackedForSerialization()
: main_graph_(nullptr, key_to_blobs_, false) {
}

// Compiler-generated destruction; the destructor is declared in the class and
// defined here, out of line.
PrepackedForSerialization::~PrepackedForSerialization() = default;

void PrepackedForSerialization::Subgraph::Insert(std::string key, PrePackedWeights&& packed_weight) {
auto result = key_to_blobs_.emplace(std::move(key), std::move(packed_weight));
ORT_ENFORCE(result.second, "Duplicate pre-packed weight from disk");
}

// Stores `packed_weight` under `key`, replacing any blob already stored under that key,
// and records the resulting map position in the list of pre-packs for `weight_name`.
//
// Replacing is intentional: a blob that was mapped from disk may already be present and
// the most recently produced pre-packed version must take its place.
//
// Returns true when `key` was newly inserted, false when an existing entry was overwritten.
//
// NOTE(review): the position is appended unconditionally, so overwriting a key that was
// already recorded for `weight_name` leaves the same iterator in the list twice —
// confirm that consumers tolerate duplicates.
bool PrepackedForSerialization::Subgraph::CreateOrOverWrite(const std::string& weight_name, std::string key,
                                                            PrePackedWeights&& packed_weight) {
  const auto [blob_pos, is_new_key] = key_to_blobs_.insert_or_assign(std::move(key), std::move(packed_weight));
  weight_to_pre_packs_[weight_name].push_back(blob_pos);
  return is_new_key;
}

// Looks up the blob stored under `key` in the shared key -> blob map.
// Returns a pointer to the stored blob, or nullptr when the key is absent.
const PrePackedWeights* PrepackedForSerialization::Subgraph::GetPrepackedWeights(const std::string& key) const {
  const auto found = key_to_blobs_.find(key);
  return found != key_to_blobs_.end() ? &found->second : nullptr;
}

// Mutable overload: returns a pointer to the blob stored under `key` in the shared
// key -> blob map, or nullptr when the key is absent.
PrePackedWeights* PrepackedForSerialization::Subgraph::GetPrepackedWeights(const std::string& key) {
  if (auto found = key_to_blobs_.find(key); found != key_to_blobs_.end()) {
    return &found->second;
  }
  return nullptr;
}

std::optional<PrePackedWeights> PrepackedForSerialization::TakePrepackedWeights(const std::string& key) {
auto it = key_to_blobs_.find(key);
if (it == key_to_blobs_.end()) {
return std::nullopt;
}
PrePackedWeights result = std::move(it->second);
key_to_blobs_.erase(it);
return result;
}

// Returns the Subgraph entry that corresponds to `graph`.
// A graph without a parent maps to the main-graph entry. Otherwise the parent's entry
// is resolved first (recursively), and the entry for `graph` is looked up in it and
// created if it does not exist yet.
PrepackedForSerialization::Subgraph& PrepackedForSerialization::FindOrCreateSubgraph(const Graph& graph) {
  const auto* parent_graph = graph.ParentGraph();
  if (parent_graph == nullptr) {
    return main_graph_;
  }
  return FindOrCreateSubgraph(*parent_graph).GetOrCreateSubgraph(graph);
}

} // namespace onnxruntime
138 changes: 130 additions & 8 deletions onnxruntime/core/framework/prepacked_weights_container.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,25 @@

#pragma once

#include <unordered_map>
#include <unordered_set>
#include <string>
#include <cstdint>

#include "core/framework/buffer_deleter.h"

#include "core/common/common.h"
#include "core/framework/allocator.h"
#include <mutex>
#include "prepacked_weights.h"

#include <cstdint>
#include <mutex>
#include <optional>
#include <string>
#include <tuple>
#include <unordered_map>

namespace onnxruntime {

#ifndef SHARED_PROVIDER
class Graph;
#else
struct Graph;
#endif

class PrepackedWeightsContainer final {
public:
PrepackedWeightsContainer() {
Expand Down Expand Up @@ -66,4 +72,120 @@ class PrepackedWeightsContainer final {
std::unordered_map<std::string, PrePackedWeights> prepacked_weights_map_;
};

/// <summary>
/// This class has a dual purpose.
/// When saving to disk is ON (IsSaveModeOn() returns true),
/// it provides a storage container for PrePackedWeights instances. The pre-packed
/// data is collected using PrepackConstantInitializers(). In this case the newly pre-packed
/// data is used for writing to disk, unless the old data matches.
///
/// If saving is OFF, it is used to contain the weights memory mapped from disk.
/// Those weights are then moved to the shared container if weight sharing is enabled,
/// as well as to the interested kernels.
/// </summary>
class PrepackedForSerialization final {
public:
explicit PrepackedForSerialization();
~PrepackedForSerialization();

ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PrepackedForSerialization);

using KeyToBlobMap = std::unordered_map<std::string, PrePackedWeights>;
using KeyToBlobMapIterator = KeyToBlobMap::iterator;
using BlobsInderect = std::vector<KeyToBlobMapIterator>;
using BlobsConstIterator = BlobsInderect::const_iterator;

// Maps weight name to iterators in key_to_blobs_. It associates a weight name with its pre-packs.
// Normally, a single weight produces a single PrePackedWeights. But it is possible that a weight
// is pre-packed by different kernels.
using WeightToPrePacksMap = std::unordered_map<std::string, BlobsInderect>;

class Subgraph {
public:
Subgraph(Subgraph* par, KeyToBlobMap& key_blobs, bool overwrite_for_save)
: save_mode_on_(overwrite_for_save), parent_(par), key_to_blobs_(key_blobs) {
}

ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Subgraph);

Subgraph* Parent() noexcept {
return parent_;
}

Subgraph& GetOrCreateSubgraph(const Graph& graph) {
auto result = subgraph_prepacks_.emplace(&graph, nullptr);
if (result.second) {
result.first->second = std::make_unique<Subgraph>(this, key_to_blobs_, save_mode_on_);
}
return *result.first->second;
}

const Subgraph* GetSubgraph(const Graph& graph) const {
auto it = subgraph_prepacks_.find(&graph);
return it == subgraph_prepacks_.end() ? nullptr : it->second.get();
}

Subgraph* GetSubgraph(const Graph& graph) {
auto it = subgraph_prepacks_.find(&graph);
return it == subgraph_prepacks_.end() ? nullptr : it->second.get();
}

// This does not populate per-initializer structures.
void Insert(std::string key, PrePackedWeights&& packed_weight);

bool CreateOrOverWrite(const std::string& weight_name, std::string key,
PrePackedWeights&& packed_weight);

const PrePackedWeights* GetPrepackedWeights(const std::string& key) const;

PrePackedWeights* GetPrepackedWeights(const std::string& key);

bool IsSaveModeOn() const noexcept {
return save_mode_on_;
}

void SetSaveMode(bool value) noexcept {
save_mode_on_ = value;
}

private:
bool save_mode_on_;
Subgraph* parent_ = nullptr;
KeyToBlobMap& key_to_blobs_;
WeightToPrePacksMap weight_to_pre_packs_;
// Map Graph ptr to subgraphs
std::unordered_map<const Graph*, std::unique_ptr<Subgraph>> subgraph_prepacks_;
};

const Subgraph& MainGraph() const noexcept {
return main_graph_;
}

Subgraph& MainGraph() noexcept {
return main_graph_;
}

size_t GetNumberOfKeyedBlobs() const noexcept {
return key_to_blobs_.size();
}

void SetSaveMode(bool value) noexcept {
main_graph_.SetSaveMode(value);
}

bool IsSaveModeOn() const noexcept {
return main_graph_.IsSaveModeOn();
}

std::optional<PrePackedWeights> TakePrepackedWeights(const std::string& key);

Subgraph& FindOrCreateSubgraph(const Graph& graph);

private:
// Map of key to pre-packed blobs.This is common for all subgraphs
// The key is : op_type + "+" + hash_of_prepacked_buffers_in_the_PrepackedWeights_instance.
// as defined above. We store keys for all scopes (main graph and subgraphs)
KeyToBlobMap key_to_blobs_;
Subgraph main_graph_;
};
} // namespace onnxruntime
Loading

0 comments on commit df8f630

Please sign in to comment.