From 8cfb9542f3629a5cda58518145754f4fec1a1ff3 Mon Sep 17 00:00:00 2001
From: MirceaDan99 <mircea-aurelian.dan@intel.com>
Date: Wed, 11 Dec 2024 10:37:13 +0200
Subject: [PATCH] Use the alternative from `PR #27981` for memory-mapped
 buffers instead

---
 .../openvino/runtime/aligned_buffer.hpp       |  8 +-
 .../openvino/runtime/shared_buffer.hpp        | 21 -----
 .../openvino/runtime/compilation_context.hpp  |  7 +-
 .../dev_api/openvino/runtime/iplugin.hpp      | 27 ------
 src/inference/src/cache_manager.hpp           |  1 -
 src/inference/src/dev/compilation_context.cpp |  7 +-
 src/inference/src/dev/core_impl.cpp           |  2 +-
 src/inference/src/dev/iplugin.cpp             | 13 ---
 src/inference/src/dev/plugin.cpp              | 13 ---
 src/inference/src/dev/plugin.hpp              | 10 +-
 src/plugins/intel_cpu/src/plugin.cpp          | 28 ++----
 src/plugins/intel_cpu/src/plugin.h            | 16 +---
 src/plugins/intel_cpu/src/utils/serialize.cpp | 70 ++++++--------
 src/plugins/intel_cpu/src/utils/serialize.hpp |  8 +-
 .../intel_npu/common/icompiler_adapter.hpp    |  2 +-
 .../include/intel_npu/common/igraph.hpp       |  8 +-
 .../common/include/intel_npu/common/npu.hpp   | 10 --
 .../intel_npu/src/common/src/igraph.cpp       |  6 +-
 .../src/compiler_adapter/src/driver_graph.cpp |  4 +-
 .../src/compiler_adapter/src/plugin_graph.cpp | 14 +--
 .../intel_npu/src/plugin/include/plugin.hpp   |  9 --
 .../intel_npu/src/plugin/src/plugin.cpp       | 94 +++++--------------
 22 files changed, 95 insertions(+), 283 deletions(-)
diff --git a/src/core/dev_api/openvino/runtime/aligned_buffer.hpp b/src/core/dev_api/openvino/runtime/aligned_buffer.hpp
index a7cf78ae9ee658..904e30999d10df 100644
--- a/src/core/dev_api/openvino/runtime/aligned_buffer.hpp
+++ b/src/core/dev_api/openvino/runtime/aligned_buffer.hpp
@@ -30,17 +30,14 @@ class OPENVINO_API AlignedBuffer {
     size_t size() const {
         return m_byte_size;
     }
-    void updateOffset(size_t offset) {
-        m_offset = offset;
-    }
     void* get_ptr(size_t offset) const {
         return m_aligned_buffer + offset;
     }
     void* get_ptr() {
-        return m_aligned_buffer + m_offset;
+        return m_aligned_buffer;
     }
     const void* get_ptr() const {
-        return m_aligned_buffer + m_offset;
+        return m_aligned_buffer;
     }
     template <typename T>
     T* get_ptr() {
@@ -64,7 +61,6 @@ class OPENVINO_API AlignedBuffer {
     char* m_allocated_buffer;
     char* m_aligned_buffer;
     size_t m_byte_size;
-    size_t m_offset = 0;
 };
 
 template <>
diff --git a/src/core/dev_api/openvino/runtime/shared_buffer.hpp b/src/core/dev_api/openvino/runtime/shared_buffer.hpp
index cdfe58f0741e1e..2c784ef6081c35 100644
--- a/src/core/dev_api/openvino/runtime/shared_buffer.hpp
+++ b/src/core/dev_api/openvino/runtime/shared_buffer.hpp
@@ -16,7 +16,6 @@ class SharedBuffer : public ov::AlignedBuffer {
         m_allocated_buffer = data;
         m_aligned_buffer = data;
         m_byte_size = size;
-        m_offset = 0;
     }
 
     virtual ~SharedBuffer() {
@@ -82,26 +81,6 @@ class OwningSharedStreamBuffer : public SharedStreamBuffer {
         return m_shared_obj;
     }
 
-    std::streamsize xsgetn(char* s, std::streamsize count) override {
-        auto streamSize = SharedStreamBuffer::xsgetn(s, count);
-        m_shared_obj->updateOffset(m_offset);
-        return streamSize;
-    }
-
-    int_type uflow() override {
-        auto val = SharedStreamBuffer::uflow();
-        m_shared_obj->updateOffset(m_offset);
-        return val;
-    }
-
-    pos_type seekoff(off_type off,
-                     std::ios_base::seekdir dir,
-                     std::ios_base::openmode which = std::ios_base::in) override {
-        auto pos = SharedStreamBuffer::seekoff(off, dir, which);
-        m_shared_obj->updateOffset(m_offset);
-        return pos;
-    }
-
 protected:
     std::shared_ptr<ov::AlignedBuffer> m_shared_obj;
 };
diff --git a/src/inference/dev_api/openvino/runtime/compilation_context.hpp b/src/inference/dev_api/openvino/runtime/compilation_context.hpp
index 033797c9d0d811..ba3a2aa8d64ded 100644
--- a/src/inference/dev_api/openvino/runtime/compilation_context.hpp
+++ b/src/inference/dev_api/openvino/runtime/compilation_context.hpp
@@ -32,10 +32,9 @@ class CompiledBlobHeader final {
     std::string m_ieVersion;
     std::string m_fileInfo;
     std::string m_runtimeInfo;
-    std::shared_ptr<ov::AlignedBuffer> m_model_buffer;
 
 public:
-    CompiledBlobHeader(std::shared_ptr<ov::AlignedBuffer> model_buffer);
+    CompiledBlobHeader();
     CompiledBlobHeader(const std::string& ieVersion, const std::string& fileInfo, const std::string& runtimeInfo);
 
     const std::string& get_openvino_version() const {
@@ -50,10 +49,6 @@ class CompiledBlobHeader final {
         return m_runtimeInfo;
     }
 
-    const std::shared_ptr<ov::AlignedBuffer> get_model_buffer() const {
-        return m_model_buffer;
-    }
-
     friend std::istream& operator>>(std::istream& stream, CompiledBlobHeader& header);
 
     friend std::ostream& operator<<(std::ostream& stream, const CompiledBlobHeader& header);
diff --git a/src/inference/dev_api/openvino/runtime/iplugin.hpp b/src/inference/dev_api/openvino/runtime/iplugin.hpp
index e88c3e4a539d15..8165e658c206f0 100644
--- a/src/inference/dev_api/openvino/runtime/iplugin.hpp
+++ b/src/inference/dev_api/openvino/runtime/iplugin.hpp
@@ -185,33 +185,6 @@ class OPENVINO_RUNTIME_API IPlugin : public std::enable_shared_from_this<IPlugin
                                                              const ov::SoPtr<ov::IRemoteContext>& context,
                                                              const ov::AnyMap& properties) const = 0;
 
-    /**
-     * @brief Creates an compiled model from an previously exported model using plugin implementation
-     *        and removes OpenVINO Runtime magic and plugin name
-     * @param model Reference to model output stream
-     * @param weights_buffer AlignedBuffer with cached model
-     * @param properties A ov::AnyMap of properties
-     * @return An Compiled model
-     */
-    virtual std::shared_ptr<ov::ICompiledModel> import_model(std::istream& model,
-                                                             std::shared_ptr<ov::AlignedBuffer> model_buffer,
-                                                             const ov::AnyMap& properties) const;
-
-    /**
-     * @brief Creates an compiled model from an previously exported model using plugin implementation
-     *        and removes OpenVINO Runtime magic and plugin name
-     * @param model Reference to model output stream
-     * @param weights_buffer AlignedBuffer with cached model
-     * @param context A pointer to plugin context derived from RemoteContext class used to
-     *        execute the network
-     * @param properties A ov::AnyMap of properties
-     * @return An Compiled model
-     */
-    virtual std::shared_ptr<ov::ICompiledModel> import_model(std::istream& model,
-                                                             std::shared_ptr<ov::AlignedBuffer> model_buffer,
-                                                             const ov::SoPtr<ov::IRemoteContext>& context,
-                                                             const ov::AnyMap& properties) const;
-
     /**
      * @brief Queries a plugin about supported layers in model
      * @param model Model object to query.
diff --git a/src/inference/src/cache_manager.hpp b/src/inference/src/cache_manager.hpp
index 2cf0bca01b17e7..82813e5dd4788f 100644
--- a/src/inference/src/cache_manager.hpp
+++ b/src/inference/src/cache_manager.hpp
@@ -141,7 +141,6 @@ class FileStorageCacheManager final : public ICacheManager {
                 auto mmap = ov::load_mmap_object(blob_file_name);
                 auto shared_buffer =
                     std::make_shared<ov::SharedBuffer<std::shared_ptr<MappedMemory>>>(mmap->data(), mmap->size(), mmap);
-#if 0
                 OwningSharedStreamBuffer buf(shared_buffer);
                 std::istream stream(&buf);
                 reader(stream, shared_buffer);
diff --git a/src/inference/src/dev/compilation_context.cpp b/src/inference/src/dev/compilation_context.cpp
index 34f7156190f231..c8eac0d22af35b 100644
--- a/src/inference/src/dev/compilation_context.cpp
+++ b/src/inference/src/dev/compilation_context.cpp
@@ -156,8 +156,7 @@ std::string ModelCache::compute_hash(const std::string& modelStr,
 
 //////////////////////////////////////////////////
 
-CompiledBlobHeader::CompiledBlobHeader(std::shared_ptr<ov::AlignedBuffer> model_buffer)
-    : m_model_buffer(model_buffer) {}
+CompiledBlobHeader::CompiledBlobHeader() {}
 
 CompiledBlobHeader::CompiledBlobHeader(const std::string& ieVersion,
                                        const std::string& fileInfo,
@@ -169,10 +168,6 @@ CompiledBlobHeader::CompiledBlobHeader(const std::string& ieVersion,
 std::istream& operator>>(std::istream& stream, CompiledBlobHeader& header) {
     std::string xmlStr;
     std::getline(stream, xmlStr);
-    auto model_buffer = header.get_model_buffer();
-    if (model_buffer != nullptr) {
-        model_buffer->updateOffset(stream.tellg());
-    }
 
     pugi::xml_document document;
     pugi::xml_parse_result res = document.load_string(xmlStr.c_str());
diff --git a/src/inference/src/dev/core_impl.cpp b/src/inference/src/dev/core_impl.cpp
index 02f48cdbcb0e09..673f6fd569a11e 100644
--- a/src/inference/src/dev/core_impl.cpp
+++ b/src/inference/src/dev/core_impl.cpp
@@ -1418,7 +1418,7 @@ ov::SoPtr<ov::ICompiledModel> ov::CoreImpl::load_model_from_cache(
                              ov::itt::domains::LoadTime,
                              "Core::load_model_from_cache::ReadStreamAndImport");
                 try {
-                    ov::CompiledBlobHeader header(model_buffer);
+                    ov::CompiledBlobHeader header;
                     networkStream >> header;
                     if (header.get_file_info() != ov::ModelCache::calculate_file_info(cacheContent.modelPath)) {
                         // Original file is changed, don't use cache
diff --git a/src/inference/src/dev/iplugin.cpp b/src/inference/src/dev/iplugin.cpp
index 1e1b70af861b58..1049e39bee6f49 100644
--- a/src/inference/src/dev/iplugin.cpp
+++ b/src/inference/src/dev/iplugin.cpp
@@ -57,19 +57,6 @@ const std::string& ov::IPlugin::get_device_name() const {
     return m_plugin_name;
 }
 
-std::shared_ptr<ov::ICompiledModel> ov::IPlugin::import_model(std::istream& model,
-                                                              std::shared_ptr<ov::AlignedBuffer> model_buffer,
-                                                              const ov::AnyMap& properties) const {
-    OPENVINO_THROW_NOT_IMPLEMENTED("This method is not implemented");
-}
-
-std::shared_ptr<ov::ICompiledModel> ov::IPlugin::import_model(std::istream& model,
-                                                              std::shared_ptr<ov::AlignedBuffer> model_buffer,
-                                                              const ov::SoPtr<ov::IRemoteContext>& context,
-                                                              const ov::AnyMap& properties) const {
-    OPENVINO_THROW_NOT_IMPLEMENTED("This method is not implemented");
-}
-
 void ov::IPlugin::set_core(const std::weak_ptr<ov::ICore>& core) {
     OPENVINO_ASSERT(!core.expired());
     m_core = core;
diff --git a/src/inference/src/dev/plugin.cpp b/src/inference/src/dev/plugin.cpp
index 605dc94e0ef487..40207bac9087fa 100644
--- a/src/inference/src/dev/plugin.cpp
+++ b/src/inference/src/dev/plugin.cpp
@@ -79,19 +79,6 @@ ov::SoPtr<ov::ICompiledModel> ov::Plugin::import_model(std::istream& model,
     OV_PLUGIN_CALL_STATEMENT(return {m_ptr->import_model(model, context, config), m_so});
 }
 
-ov::SoPtr<ov::ICompiledModel> ov::Plugin::import_model(std::istream& model,
-                                                       std::shared_ptr<ov::AlignedBuffer> model_buffer,
-                                                       const ov::AnyMap& properties) const {
-    OV_PLUGIN_CALL_STATEMENT(return {m_ptr->import_model(model, model_buffer, properties), m_so});
-}
-
-ov::SoPtr<ov::ICompiledModel> ov::Plugin::import_model(std::istream& model,
-                                                       std::shared_ptr<ov::AlignedBuffer> model_buffer,
-                                                       const ov::SoPtr<ov::IRemoteContext>& context,
-                                                       const ov::AnyMap& config) const {
-    OV_PLUGIN_CALL_STATEMENT(return {m_ptr->import_model(model, model_buffer, context, config), m_so});
-}
-
 ov::SoPtr<ov::IRemoteContext> ov::Plugin::create_context(const AnyMap& params) const {
     OV_PLUGIN_CALL_STATEMENT({
         auto remote = m_ptr->create_context(params);
diff --git a/src/inference/src/dev/plugin.hpp b/src/inference/src/dev/plugin.hpp
index bdc84737456aec..14a5adebbab3a4 100644
--- a/src/inference/src/dev/plugin.hpp
+++ b/src/inference/src/dev/plugin.hpp
@@ -59,15 +59,6 @@ class Plugin {
                                            const ov::SoPtr<ov::IRemoteContext>& context,
                                            const ov::AnyMap& config) const;
 
-    SoPtr<ov::ICompiledModel> import_model(std::istream& model,
-                                           std::shared_ptr<ov::AlignedBuffer> model_buffer,
-                                           const ov::AnyMap& properties) const;
-
-    SoPtr<ov::ICompiledModel> import_model(std::istream& model,
-                                           std::shared_ptr<ov::AlignedBuffer> model_buffer,
-                                           const ov::SoPtr<ov::IRemoteContext>& context,
-                                           const ov::AnyMap& config) const;
-
     ov::SoPtr<ov::IRemoteContext> create_context(const AnyMap& params) const;
 
     ov::SoPtr<ov::IRemoteContext> get_default_context(const AnyMap& params) const;
@@ -87,3 +78,4 @@ class Plugin {
 };
 
 }  // namespace ov
+
diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp
index b30f130c2a2a1f..33cb87b337bfef 100644
--- a/src/plugins/intel_cpu/src/plugin.cpp
+++ b/src/plugins/intel_cpu/src/plugin.cpp
@@ -7,7 +7,6 @@
 #include "cpu_streams_calculation.hpp"
 #include "internal_properties.hpp"
 #include "itt.h"
-#include "openvino/op/paged_attention.hpp"
 #include "openvino/runtime/intel_cpu/properties.hpp"
 #include "openvino/runtime/internal_properties.hpp"
 #include "openvino/runtime/properties.hpp"
@@ -20,6 +19,7 @@
 #include "utils/precision_support.h"
 #include "utils/serialize.hpp"
 #include "weights_cache.hpp"
+#include "openvino/op/paged_attention.hpp"
 
 #if defined(__linux__)
 #    include <signal.h>
@@ -200,7 +200,7 @@ static Config::ModelType getModelType(const std::shared_ptr<const Model>& model)
         return Config::ModelType::CNN;
 
     if ((op::util::has_op_with_type<op::v13::ScaledDotProductAttention>(model) && model->get_variables().size() > 0) ||
-        op::util::has_op_with_type<ov::op::PagedAttentionExtension>(model))
+         op::util::has_op_with_type<ov::op::PagedAttentionExtension>(model))
         return Config::ModelType::LLM;
 
     return Config::ModelType::Unknown;
@@ -446,17 +446,15 @@ ov::Any Plugin::get_ro_property(const std::string& name, const ov::AnyMap& optio
 
         return decltype(ov::supported_properties)::value_type(std::move(supportedProperties));
     } else if (ov::internal::supported_properties == name) {
-        return decltype(ov::internal::supported_properties)::value_type {
+        return decltype(ov::internal::supported_properties)::value_type{
             ov::PropertyName{ov::internal::caching_properties.name(), ov::PropertyMutability::RO},
 #if !defined(OPENVINO_ARCH_ARM) && !(defined(__APPLE__) || defined(__MACOSX))
-                ov::PropertyName{ov::internal::caching_with_mmap.name(), ov::PropertyMutability::RO},
+            ov::PropertyName{ov::internal::caching_with_mmap.name(), ov::PropertyMutability::RO},
 #endif
-                ov::PropertyName{ov::internal::exclusive_async_requests.name(), ov::PropertyMutability::RW},
-                ov::PropertyName{ov::internal::compiled_model_runtime_properties.name(), ov::PropertyMutability::RO},
-                ov::PropertyName {
-                ov::internal::compiled_model_runtime_properties_supported.name(), ov::PropertyMutability::RO
-            }
-        };
+            ov::PropertyName{ov::internal::exclusive_async_requests.name(), ov::PropertyMutability::RW},
+            ov::PropertyName{ov::internal::compiled_model_runtime_properties.name(), ov::PropertyMutability::RO},
+            ov::PropertyName{ov::internal::compiled_model_runtime_properties_supported.name(),
+                             ov::PropertyMutability::RO}};
     } else if (name == ov::device::full_name) {
         return decltype(ov::device::full_name)::value_type(deviceFullName);
     } else if (name == ov::available_devices) {
@@ -555,16 +553,11 @@ ov::SupportedOpsMap Plugin::query_model(const std::shared_ptr<const ov::Model>&
     return res;
 }
 
-std::shared_ptr<ov::ICompiledModel> Plugin::import_model(std::istream& model_stream, const ov::AnyMap& config) const {
-    return import_model(model_stream, nullptr, config);
-}
-
 std::shared_ptr<ov::ICompiledModel> Plugin::import_model(std::istream& model_stream,
-                                                         std::shared_ptr<ov::AlignedBuffer> model_buffer,
                                                          const ov::AnyMap& config) const {
     OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "import_model");
 
-    CacheDecrypt decrypt{codec_xor};
+    CacheDecrypt decrypt{ codec_xor };
     bool decript_from_string = false;
     if (config.count(ov::cache_encryption_callbacks.name())) {
         auto encryption_callbacks = config.at(ov::cache_encryption_callbacks.name()).as<EncryptionCallbacks>();
@@ -585,8 +578,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::import_model(std::istream& model_str
         [this](const std::shared_ptr<ov::AlignedBuffer>& model, const std::shared_ptr<ov::AlignedBuffer>& weights) {
             return get_core()->read_model(model, weights);
         },
-        decrypt,
-        decript_from_string);
+        decrypt, decript_from_string);
 
     std::shared_ptr<ov::Model> model;
     deserializer >> model;
diff --git a/src/plugins/intel_cpu/src/plugin.h b/src/plugins/intel_cpu/src/plugin.h
index c7f1dee9fb52c6..8973478d30403f 100644
--- a/src/plugins/intel_cpu/src/plugin.h
+++ b/src/plugins/intel_cpu/src/plugin.h
@@ -20,7 +20,8 @@ class Plugin : public ov::IPlugin {
     std::shared_ptr<ov::ICompiledModel> compile_model(const std::shared_ptr<const ov::Model>& model,
                                                       const ov::AnyMap& properties,
                                                       const ov::SoPtr<ov::IRemoteContext>& context) const override {
-        OPENVINO_THROW_NOT_IMPLEMENTED("compile_model with RemoteContext is not supported by CPU plugin!");
+        OPENVINO_THROW_NOT_IMPLEMENTED(
+            "compile_model with RemoteContext is not supported by CPU plugin!");
     };
 
     void set_property(const ov::AnyMap& properties) override;
@@ -29,17 +30,8 @@ class Plugin : public ov::IPlugin {
     std::shared_ptr<ov::ICompiledModel> import_model(std::istream& model,
                                                      const ov::SoPtr<ov::IRemoteContext>& context,
                                                      const ov::AnyMap& properties) const override {
-        OPENVINO_THROW_NOT_IMPLEMENTED("import_model with RemoteContext is not supported by CPU plugin!");
-    };
-
-    std::shared_ptr<ov::ICompiledModel> import_model(std::istream& model,
-                                                     std::shared_ptr<ov::AlignedBuffer> model_buffer,
-                                                     const ov::AnyMap& properties) const override;
-    std::shared_ptr<ov::ICompiledModel> import_model(std::istream& model,
-                                                     std::shared_ptr<ov::AlignedBuffer> model_buffer,
-                                                     const ov::SoPtr<ov::IRemoteContext>& context,
-                                                     const ov::AnyMap& properties) const override {
-        OPENVINO_THROW_NOT_IMPLEMENTED("import_model with RemoteContext is not supported by CPU plugin!");
+        OPENVINO_THROW_NOT_IMPLEMENTED(
+            "import_model with RemoteContext is not supported by CPU plugin!");
     };
 
     ov::SupportedOpsMap query_model(const std::shared_ptr<const ov::Model>& model,
diff --git a/src/plugins/intel_cpu/src/utils/serialize.cpp b/src/plugins/intel_cpu/src/utils/serialize.cpp
index 55b53116e4ac01..33d8140fbe4a84 100644
--- a/src/plugins/intel_cpu/src/utils/serialize.cpp
+++ b/src/plugins/intel_cpu/src/utils/serialize.cpp
@@ -14,8 +14,7 @@ namespace intel_cpu {
 ////////// ModelSerializer //////////
 
 ModelSerializer::ModelSerializer(std::ostream& ostream, CacheEncrypt encrypt_fn)
-    : m_ostream(ostream),
-      m_cache_encrypt(std::move(encrypt_fn)) {}
+    : m_ostream(ostream), m_cache_encrypt(std::move(encrypt_fn)) {}
 
 void ModelSerializer::operator<<(const std::shared_ptr<ov::Model>& model) {
     auto serialize_info = [&](std::ostream& stream) {
@@ -36,25 +35,22 @@ ModelDeserializer::ModelDeserializer(std::istream& model_stream,
                                      ModelBuilder fn,
                                      const CacheDecrypt& decrypt_fn,
                                      bool decript_from_string)
-    : m_istream(model_stream),
-      m_model_builder(std::move(fn)),
-      m_decript_from_string(decript_from_string),
-      m_model_buffer(model_buffer) {
-    if (m_decript_from_string) {
-        m_cache_decrypt.m_decrypt_str = decrypt_fn.m_decrypt_str;
-    } else {
-        m_cache_decrypt.m_decrypt_char = decrypt_fn.m_decrypt_char;
+    : m_istream(model_stream), m_model_builder(std::move(fn)), m_decript_from_string(decript_from_string), m_model_buffer(model_buffer) {
+        if (m_decript_from_string) {
+            m_cache_decrypt.m_decrypt_str = decrypt_fn.m_decrypt_str;
+        } else {
+            m_cache_decrypt.m_decrypt_char = decrypt_fn.m_decrypt_char;
+        }
     }
-}
 
-void ModelDeserializer::set_info(pugi::xml_node& root, std::shared_ptr<ov::Model>& model) {}
+    void ModelDeserializer::set_info(pugi::xml_node& root, std::shared_ptr<ov::Model>& model) {}
 
-void ModelDeserializer::operator>>(std::shared_ptr<ov::Model>& model) {
-    if (m_model_buffer) {
-        process_mmap(model, m_model_buffer);
-    } else {
-        process_stream(model);
-    }
+    void ModelDeserializer::operator>>(std::shared_ptr<ov::Model>& model) {
+        if (m_model_buffer) {
+            process_mmap(model, m_model_buffer);
+        } else {
+            process_stream(model);
+        }
 }
 
 void ModelDeserializer::process_mmap(std::shared_ptr<ov::Model>& model,
@@ -81,10 +77,7 @@ void ModelDeserializer::process_mmap(std::shared_ptr<ov::Model>& model,
     // Read model input/output precisions.
     pugi::xml_document xml_in_out_doc;
     if (hdr.custom_data_size > 0lu) {
-        auto res = xml_in_out_doc.load_buffer(buffer_base + hdr.custom_data_offset,
-                                              hdr.custom_data_size,
-                                              pugi::parse_default,
-                                              pugi::encoding_utf8);
+        auto res = xml_in_out_doc.load_buffer(buffer_base + hdr.custom_data_offset, hdr.custom_data_size, pugi::parse_default, pugi::encoding_utf8);
         if (res.status != pugi::status_ok) {
             OPENVINO_THROW("[CPU] Could to deserialize custom data.");
         }
@@ -93,10 +86,7 @@ void ModelDeserializer::process_mmap(std::shared_ptr<ov::Model>& model,
     // Map blob content
     std::shared_ptr<ov::AlignedBuffer> weights_buf;
     if (hdr.consts_size) {
-        weights_buf =
-            std::make_shared<ov::SharedBuffer<std::shared_ptr<ov::AlignedBuffer>>>(buffer_base + hdr.consts_offset,
-                                                                                   hdr.consts_size,
-                                                                                   mmemory);
+        weights_buf = std::make_shared<ov::SharedBuffer<std::shared_ptr<ov::AlignedBuffer>>>(buffer_base + hdr.consts_offset, hdr.consts_size, mmemory);
     }
 
     // XML content
@@ -113,7 +103,9 @@ void ModelDeserializer::process_mmap(std::shared_ptr<ov::Model>& model,
         xml_buff->assign(buffer_base + hdr.model_offset, hdr.model_size);
     }
     std::shared_ptr<ov::AlignedBuffer> model_buf =
-        std::make_shared<ov::SharedBuffer<std::shared_ptr<std::string>>>(&((*xml_buff)[0]), hdr.model_size, xml_buff);
+            std::make_shared<ov::SharedBuffer<std::shared_ptr<std::string>>>(&((*xml_buff)[0]),
+                                                                             hdr.model_size,
+                                                                             xml_buff);
 
     model = m_model_builder(model_buf, weights_buf);
 
@@ -158,7 +150,7 @@ void ModelDeserializer::process_stream(std::shared_ptr<ov::Model>& model) {
     auto data_blob = std::make_shared<ov::Tensor>(ov::element::u8, ov::Shape({hdr.consts_size}));
     m_istream.seekg(hdr.consts_offset);
     if (hdr.consts_size) {
-        m_istream.read(static_cast<char*>(data_blob->data(ov::element::u8)), hdr.consts_size);
+        m_istream.read(static_cast<char *>(data_blob->data(ov::element::u8)), hdr.consts_size);
     }
 
     // read XML content
@@ -170,20 +162,16 @@ void ModelDeserializer::process_stream(std::shared_ptr<ov::Model>& model) {
         if (m_decript_from_string) {
             *xml_string = m_cache_decrypt.m_decrypt_str(*xml_string);
         } else {
-            m_cache_decrypt.m_decrypt_char(const_cast<char*>(xml_string->data()),
-                                           xml_string->data(),
-                                           xml_string->size());
+            m_cache_decrypt.m_decrypt_char(const_cast<char*>(xml_string->data()), xml_string->data(), xml_string->size());
         }
     }
 
-    auto model_buf =
-        std::make_shared<ov::SharedBuffer<std::shared_ptr<std::string>>>(const_cast<char*>(xml_string->data()),
-                                                                         xml_string->size(),
-                                                                         xml_string);
-    auto weights_buf = std::make_shared<ov::SharedBuffer<std::shared_ptr<ov::Tensor>>>(
-        reinterpret_cast<char*>(data_blob->data(ov::element::u8)),
-        hdr.consts_size,
-        data_blob);
+    auto model_buf = std::make_shared<ov::SharedBuffer<std::shared_ptr<std::string>>>(const_cast<char*>(xml_string->data()),
+                                                                                      xml_string->size(),
+                                                                                      xml_string);
+    auto weights_buf = std::make_shared<ov::SharedBuffer<std::shared_ptr<ov::Tensor>>>(reinterpret_cast<char*>(data_blob->data(ov::element::u8)),
+                                                                                       hdr.consts_size,
+                                                                                       data_blob);
 
     model = m_model_builder(model_buf, weights_buf);
 
@@ -192,5 +180,5 @@ void ModelDeserializer::process_stream(std::shared_ptr<ov::Model>& model) {
     set_info(root, model);
 }
 
-}  // namespace intel_cpu
-}  // namespace ov
+}   // namespace intel_cpu
+}   // namespace ov
diff --git a/src/plugins/intel_cpu/src/utils/serialize.hpp b/src/plugins/intel_cpu/src/utils/serialize.hpp
index 0821b1160c38d7..4dfdd6b22afbd4 100644
--- a/src/plugins/intel_cpu/src/utils/serialize.hpp
+++ b/src/plugins/intel_cpu/src/utils/serialize.hpp
@@ -29,9 +29,7 @@ class ModelSerializer {
 
 class ModelDeserializer {
 public:
-    typedef std::function<std::shared_ptr<ov::Model>(const std::shared_ptr<ov::AlignedBuffer>&,
-                                                     const std::shared_ptr<ov::AlignedBuffer>&)>
-        ModelBuilder;
+    typedef std::function<std::shared_ptr<ov::Model>(const std::shared_ptr<ov::AlignedBuffer>&, const std::shared_ptr<ov::AlignedBuffer>&)> ModelBuilder;
 
     ModelDeserializer(std::istream& model,
                       std::shared_ptr<ov::AlignedBuffer> model_buffer,
@@ -57,5 +55,5 @@ class ModelDeserializer {
     std::shared_ptr<ov::AlignedBuffer> m_model_buffer;
 };
 
-}  // namespace intel_cpu
-}  // namespace ov
+}   // namespace intel_cpu
+}   // namespace ov
diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiler_adapter.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiler_adapter.hpp
index 6e585299d68a1d..375ab305db57fc 100644
--- a/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiler_adapter.hpp
+++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiler_adapter.hpp
@@ -12,7 +12,7 @@ class ICompilerAdapter {
 public:
     virtual std::shared_ptr<IGraph> compile(const std::shared_ptr<const ov::Model>& model,
                                             const Config& config) const = 0;
-    virtual std::shared_ptr<IGraph> parse(std::vector<uint8_t> network, const Config& config) const = 0;
+    virtual std::shared_ptr<IGraph> parse(std::unique_ptr<BlobContainer> blobPtr, const Config& config) const = 0;
     virtual ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model, const Config& config) const = 0;
 
     virtual ~ICompilerAdapter() = default;
diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp
index 365cc35727cf4c..54155887b23972 100644
--- a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp
+++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp
@@ -58,10 +58,11 @@ class BlobContainerVector : public BlobContainer {
 
 class BlobContainerAlignedBuffer : public BlobContainer {
 public:
-    BlobContainerAlignedBuffer(const std::shared_ptr<ov::AlignedBuffer>& blobSO) : _ownershipBlob(blobSO) {}
+    BlobContainerAlignedBuffer(const std::shared_ptr<ov::AlignedBuffer>& blobSO,
+                                size_t offset) : _ownershipBlob(blobSO), _offset(offset) {}
 
     void* get_ptr() override {
-        return _ownershipBlob->get_ptr();
+        return _ownershipBlob->get_ptr(_offset);
     }
 
     size_t size() const override {
@@ -74,6 +75,7 @@ class BlobContainerAlignedBuffer : public BlobContainer {
 
 private:
     std::shared_ptr<ov::AlignedBuffer> _ownershipBlob;
+    size_t _offset;
 };
 
 class IGraph : public std::enable_shared_from_this<IGraph> {
@@ -149,7 +151,7 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
     // first inference starts running
     std::mutex _mutex;
 
-    std::unique_ptr<BlobContainer> _blob;
+    std::unique_ptr<BlobContainer> _blobPtr;
 
     uint32_t _unique_id = 0;
     uint32_t _last_submitted_id;
diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp
index 168b57e30945ee..9e4c59852151ce 100644
--- a/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp
+++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp
@@ -57,16 +57,6 @@ class IEngineBackend : public std::enable_shared_from_this<IEngineBackend> {
 
 //------------------------------------------------------------------------------
 
-class ICompilerAdapter {
-public:
-    virtual std::shared_ptr<IGraph> compile(const std::shared_ptr<const ov::Model>& model,
-                                            const Config& config) const = 0;
-    virtual std::shared_ptr<IGraph> parse(std::unique_ptr<BlobContainer> blobPtr, const Config& config) const = 0;
-    virtual ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model, const Config& config) const = 0;
-
-    virtual ~ICompilerAdapter() = default;
-};
-
 //------------------------------------------------------------------------------
 
 class IDevice : public std::enable_shared_from_this<IDevice> {
diff --git a/src/plugins/intel_npu/src/common/src/igraph.cpp b/src/plugins/intel_npu/src/common/src/igraph.cpp
index 8bd26367cd0f1f..3f04bb0ce8e5ff 100644
--- a/src/plugins/intel_npu/src/common/src/igraph.cpp
+++ b/src/plugins/intel_npu/src/common/src/igraph.cpp
@@ -17,12 +17,13 @@ namespace intel_npu {
 IGraph::IGraph(ze_graph_handle_t handle,
                NetworkMetadata metadata,
                const Config& config,
-               std::optional<std::unique_ptr<BlobContainer>> blob)
+               std::optional<std::unique_ptr<BlobContainer>> blobPtr)
     : _handle(handle),
       _metadata(std::move(metadata)),
       _logger("IGraph", config.get<LOG_LEVEL>()) {
-    if (blob.has_value()) {
-        _blob = std::move(*blob);
+    // The blob container is optional; ownership is taken over only when one was actually passed in.
+    if (blobPtr) {
+        _blobPtr = std::move(blobPtr.value());
     }
 }
 
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp
index 0019eb1bdf17d4..3ce216c255f0e4 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp
@@ -139,7 +139,7 @@ void DriverGraph::initialize(const Config& config) {
 }
 
 bool DriverGraph::release_blob(const Config& config) {
-    if (_blob == nullptr || _zeroInitStruct->getGraphDdiTable().version() < ZE_GRAPH_EXT_VERSION_1_8 ||
+    if (_blobPtr == nullptr || _zeroInitStruct->getGraphDdiTable().version() < ZE_GRAPH_EXT_VERSION_1_8 ||
         config.get<PERF_COUNT>()) {
         return false;
     }
@@ -152,7 +152,7 @@ bool DriverGraph::release_blob(const Config& config) {
         return false;
     }
 
-    if (!_blob->release_from_memory()) {
+    if (!_blobPtr->release_from_memory()) {
         return false;
     }
 
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp
index ce02e0caad8edd..87d530a4086817 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp
@@ -34,7 +34,7 @@ PluginGraph::PluginGraph(const std::shared_ptr<ZeGraphExtWrappers>& zeGraphExt,
 }
 
 void PluginGraph::export_blob(std::ostream& stream) const {
-    stream.write(reinterpret_cast<const char*>(_blob->get_ptr()), _blob->size());
+    stream.write(reinterpret_cast<const char*>(_blobPtr->get_ptr()), _blobPtr->size());
 
     if (!stream) {
         _logger.error("Write blob to stream failed. Blob is broken!");
@@ -43,14 +43,14 @@ void PluginGraph::export_blob(std::ostream& stream) const {
 
     if (_logger.level() >= ov::log::Level::INFO) {
         std::uint32_t result = 1171117u;
-        for (const uint8_t* it = reinterpret_cast<const uint8_t*>(_blob->get_ptr());
-             it != reinterpret_cast<const uint8_t*>(_blob->get_ptr()) + _blob->size();
+        for (const uint8_t* it = reinterpret_cast<const uint8_t*>(_blobPtr->get_ptr());
+             it != reinterpret_cast<const uint8_t*>(_blobPtr->get_ptr()) + _blobPtr->size();
              ++it) {
             result = ((result << 7) + result) + static_cast<uint32_t>(*it);
         }
 
         std::stringstream str;
-        str << "Blob size: " << _blob->size() << ", hash: " << std::hex << result;
+        str << "Blob size: " << _blobPtr->size() << ", hash: " << std::hex << result;
         _logger.info(str.str().c_str());
     }
     _logger.info("Write blob to stream successfully.");
@@ -58,9 +58,10 @@ void PluginGraph::export_blob(std::ostream& stream) const {
 
 std::vector<ov::ProfilingInfo> PluginGraph::process_profiling_output(const std::vector<uint8_t>& profData,
                                                                      const Config& config) const {
-    std::vector<uint8_t> blob(_blob->size());
-    blob.assign(reinterpret_cast<const uint8_t*>(_blob->get_ptr()),
-                reinterpret_cast<const uint8_t*>(_blob->get_ptr()) + _blob->size());
+    // Copy the blob bytes into the vector handed to the compiler. Building the vector directly from the range
+    // avoids zero-filling it first only to overwrite every byte afterwards.
+    const auto* blobData = reinterpret_cast<const uint8_t*>(_blobPtr->get_ptr());
+    std::vector<uint8_t> blob(blobData, blobData + _blobPtr->size());
     return _compiler->process_profiling_output(profData, blob, config);
 }
 
diff --git a/src/plugins/intel_npu/src/plugin/include/plugin.hpp b/src/plugins/intel_npu/src/plugin/include/plugin.hpp
index c91af51c5443ce..6b1b46872788e3 100644
--- a/src/plugins/intel_npu/src/plugin/include/plugin.hpp
+++ b/src/plugins/intel_npu/src/plugin/include/plugin.hpp
@@ -44,16 +44,7 @@ class Plugin : public ov::IPlugin {
 
     std::shared_ptr<ov::ICompiledModel> import_model(std::istream& stream, const ov::AnyMap& properties) const override;
 
-    std::shared_ptr<ov::ICompiledModel> import_model(std::istream& /* unusedStream */,
-                                                     std::shared_ptr<ov::AlignedBuffer> model_buffer,
-                                                     const ov::AnyMap& properties) const override;
-
-    std::shared_ptr<ov::ICompiledModel> import_model(std::istream& stream,
-                                                     const ov::SoPtr<ov::IRemoteContext>& context,
-                                                     const ov::AnyMap& properties) const override;
-
     std::shared_ptr<ov::ICompiledModel> import_model(std::istream& stream,
-                                                     std::shared_ptr<ov::AlignedBuffer> model_buffer,
                                                      const ov::SoPtr<ov::IRemoteContext>& context,
                                                      const ov::AnyMap& properties) const override;
 
diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp
index dd322c0ed47962..bcebc46fb22114 100644
--- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp
+++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp
@@ -16,8 +16,8 @@
 #include "intel_npu/config/compiler.hpp"
 #include "intel_npu/config/npuw.hpp"
 #include "intel_npu/config/runtime.hpp"
-#include "intel_npu/utils/zero/zero_init.hpp"
 #include "npuw/compiled_model.hpp"
+#include "intel_npu/utils/zero/zero_init.hpp"
 #include "openvino/op/constant.hpp"
 #include "openvino/op/parameter.hpp"
 #include "openvino/runtime/intel_npu/properties.hpp"
@@ -752,7 +752,16 @@ std::shared_ptr<ov::ICompiledModel> Plugin::import_model(std::istream& stream, c
     OV_ITT_SCOPED_TASK(itt::domains::NPUPlugin, "Plugin::import_model");
     OV_ITT_TASK_CHAIN(PLUGIN_IMPORT_MODEL, itt::domains::NPUPlugin, "Plugin::import_model", "merge_configs");
 
-    const std::map<std::string, std::string> propertiesMap = any_copy(properties);
+    // A cached model buffer may be passed through an internal property. Pull it out and drop the entry before the
+    // remaining properties are converted to strings.
+    auto localProperties = properties;
+    std::shared_ptr<ov::AlignedBuffer> modelBuffer;
+    if (auto it = localProperties.find(ov::internal::cached_model_buffer.name()); it != localProperties.end()) {
+        modelBuffer = it->second.as<std::shared_ptr<ov::AlignedBuffer>>();
+        localProperties.erase(it);
+    }
+
+    const std::map<std::string, std::string> propertiesMap = any_copy(localProperties);
     auto localConfig = merge_configs(_globalConfig, propertiesMap, OptionMode::RunTime);
     _logger.setLevel(localConfig.get<LOG_LEVEL>());
     const auto platform = _backends->getCompilationPlatform(localConfig.get<PLATFORM>(), localConfig.get<DEVICE_ID>());
@@ -774,64 +783,29 @@ std::shared_ptr<ov::ICompiledModel> Plugin::import_model(std::istream& stream, c
     try {
         CompilerAdapterFactory compilerAdapterFactory;
         auto compiler = compilerAdapterFactory.getCompiler(_backends->getIEngineBackend(), localConfig);
+        std::unique_ptr<BlobContainer> blobPtr;
 
-        auto graphSize = getFileSize(stream);
-
-        std::vector<uint8_t> blob(graphSize);
-        stream.read(reinterpret_cast<char*>(blob.data()), graphSize);
-        if (!stream) {
-            OPENVINO_THROW("Failed to read data from stream!");
-        }
-        _logger.debug("Successfully read %zu bytes into blob.", graphSize);
-
-        auto blobContainerPtr = std::make_unique<BlobContainerVector>(std::move(blob));
-        auto graph = compiler->parse(std::move(blobContainerPtr), localConfig);
-        graph->update_network_name("net" + std::to_string(_compiledModelLoadCounter++));
-
-        const std::shared_ptr<ov::Model> modelDummy =
-            create_dummy_model(graph->get_metadata().inputs, graph->get_metadata().outputs);
-
-        compiledModel = std::make_shared<CompiledModel>(modelDummy, shared_from_this(), device, graph, localConfig);
-    } catch (const std::exception& ex) {
-        OPENVINO_THROW("Can't import network: ", ex.what());
-    } catch (...) {
-        OPENVINO_THROW("NPU import_model got unexpected exception from CompiledModel");
-    }
-
-    OV_ITT_TASK_SKIP(PLUGIN_IMPORT_MODEL);
+        if (modelBuffer == nullptr) {
+            // No ready-made buffer was supplied: copy the blob out of the stream.
+            auto graphSize = getFileSize(stream);
 
-    return compiledModel;
-}
-
-std::shared_ptr<ov::ICompiledModel> Plugin::import_model(std::istream& /* unusedStream */,
-                                                         std::shared_ptr<ov::AlignedBuffer> model_buffer,
-                                                         const ov::AnyMap& properties) const {
-    OV_ITT_SCOPED_TASK(itt::domains::NPUPlugin, "Plugin::import_model");
-    OV_ITT_TASK_CHAIN(PLUGIN_IMPORT_MODEL, itt::domains::NPUPlugin, "Plugin::import_model", "merge_configs");
-
-    const std::map<std::string, std::string> propertiesMap = any_copy(properties);
-    auto localConfig = merge_configs(_globalConfig, propertiesMap, OptionMode::RunTime);
-    _logger.setLevel(localConfig.get<LOG_LEVEL>());
-    const auto platform = _backends->getCompilationPlatform(localConfig.get<PLATFORM>(), localConfig.get<DEVICE_ID>());
-    localConfig.update({{ov::intel_npu::platform.name(), platform}});
-    auto device = _backends->getDevice(localConfig.get<DEVICE_ID>());
-
-    set_batch_config(_backends->isBatchingSupported(), localConfig);
-
-    const auto loadedFromCache = localConfig.get<LOADED_FROM_CACHE>();
-    if (!loadedFromCache) {
-        _logger.warning(
-            "The usage of a compiled model can lead to undefined behavior. Please use OpenVINO IR instead!");
-    }
-
-    OV_ITT_TASK_NEXT(PLUGIN_IMPORT_MODEL, "parse");
+            std::vector<uint8_t> blob(graphSize);
+            stream.read(reinterpret_cast<char*>(blob.data()), graphSize);
+            if (!stream) {
+                OPENVINO_THROW("Failed to read data from stream!");
+            }
+            _logger.debug("Successfully read %zu bytes into blob.", graphSize);
 
-    std::shared_ptr<ov::ICompiledModel> compiledModel;
+            blobPtr = std::make_unique<BlobContainerVector>(std::move(blob));
+        } else {
+            // The current stream position is used as the blob's offset into the supplied buffer. tellg() reports
+            // failure as -1, which must not be turned into a (huge) unsigned offset.
+            const std::streamoff blobOffset = stream.tellg();
+            OPENVINO_ASSERT(blobOffset >= 0, "Failed to get the blob offset from stream!");
+            blobPtr = std::make_unique<BlobContainerAlignedBuffer>(modelBuffer, static_cast<size_t>(blobOffset));
+        }
 
-    try {
-        auto compiler = getCompiler(localConfig);
-        auto blobContainerPtr = std::make_unique<BlobContainerAlignedBuffer>(model_buffer);
-        auto graph = compiler->parse(std::move(blobContainerPtr), localConfig);
+        auto graph = compiler->parse(std::move(blobPtr), localConfig);
         graph->update_network_name("net" + std::to_string(_compiledModelLoadCounter++));
 
         const std::shared_ptr<ov::Model> modelDummy =
@@ -857,19 +831,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::import_model(std::istream& stream,
         OPENVINO_THROW("Invalid remote context type. Can't cast to ov::intel_npu::RemoteContext type");
     }
 
-    return import_model(stream, context, properties);
-}
-
-std::shared_ptr<ov::ICompiledModel> Plugin::import_model(std::istream& stream,
-                                                         std::shared_ptr<ov::AlignedBuffer> model_buffer,
-                                                         const ov::SoPtr<ov::IRemoteContext>& context,
-                                                         const ov::AnyMap& properties) const {
-    auto casted = std::dynamic_pointer_cast<RemoteContextImpl>(context._ptr);
-    if (casted == nullptr) {
-        OPENVINO_THROW("Invalid remote context type. Can't cast to ov::intel_npu::RemoteContext type");
-    }
-
-    return import_model(stream, model_buffer, properties);
+    return import_model(stream, properties);
 }
 
 ov::SupportedOpsMap Plugin::query_model(const std::shared_ptr<const ov::Model>& model,