From 73f25cf098a1efa9cf2b889e28dfd98bd447cc4a Mon Sep 17 00:00:00 2001
From: liubo-intel
Date: Wed, 3 Jul 2024 00:58:16 +0800
Subject: [PATCH] Update CPU core number heuristic based on weight size

---
 samples/cpp/benchmark_app/main.cpp            |  6 ++--
 src/inference/src/dev/core_impl.cpp           | 31 ++++++++++++++++++-
 src/inference/src/dev/core_impl.hpp           |  4 +++
 src/plugins/intel_cpu/src/config.cpp          |  2 ++
 src/plugins/intel_cpu/src/config.h            |  1 +
 .../intel_cpu/src/cpu_streams_calculation.cpp | 23 ++++++++++++--
 6 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/samples/cpp/benchmark_app/main.cpp b/samples/cpp/benchmark_app/main.cpp
index 3ab21dc0489286..2ca07ea86e4a03 100644
--- a/samples/cpp/benchmark_app/main.cpp
+++ b/samples/cpp/benchmark_app/main.cpp
@@ -639,8 +639,8 @@ int main(int argc, char* argv[]) {
         }
 
         // Get the size of the .bin file
-        long fileSize = getFileSize(binPath);
-        std::cout << "my test: The size of '" << binPath << "' is " << fileSize / (1024 * 1024) << " Mbytes." << std::endl;
+        int fileSize = static_cast<int>(getFileSize(binPath) / (1024 * 1024));
+        std::cout << "my test: The size of '" << binPath << "' is " << fileSize << " Mbytes." << std::endl;
 
         auto duration_ms = get_duration_ms_till_now(startTime);
         slog::info << "Read model took " << double_to_string(duration_ms) << " ms" << slog::endl;
@@ -788,6 +788,8 @@ int main(int argc, char* argv[]) {
         // --------------------------------------------------------
         next_step();
         startTime = Time::now();
+        // my test
+        // device_config["weights_size"] = fileSize;
         compiledModel = core.compile_model(model, device_name, device_config);
         duration_ms = get_duration_ms_till_now(startTime);
         slog::info << "Compile model took " << double_to_string(duration_ms) << " ms" << slog::endl;
diff --git a/src/inference/src/dev/core_impl.cpp b/src/inference/src/dev/core_impl.cpp
index 637a5e45596357..71303d5eaba989 100644
--- a/src/inference/src/dev/core_impl.cpp
+++ b/src/inference/src/dev/core_impl.cpp
@@ -34,6 +34,7 @@
 #    include "openvino/proxy/plugin.hpp"
 #    include "openvino/proxy/properties.hpp"
 #endif
+#include "openvino/op/constant.hpp"
 
 ov::ICore::~ICore() = default;
 
@@ -722,6 +723,8 @@ ov::SoPtr<ov::ICompiledModel> ov::CoreImpl::compile_model(const std::shared_ptr<const ov::Model>& model_,
     OV_ITT_SCOPE(FIRST_INFERENCE, ov::itt::domains::LoadTime, "Core::compile_model::model");
     std::string deviceName = device_name;
     ov::AnyMap config_with_batch = config;
+    config_with_batch["weights_size"] = coreConfig.get_weight_size();
+
     // if auto-batching is applicable, the below function will patch the device name and config accordingly:
     auto model = apply_auto_batching(model_, deviceName, config_with_batch);
 
@@ -1525,6 +1528,15 @@ bool ov::CoreImpl::CoreConfig::get_enable_mmap() const {
     return _flag_enable_mmap;
 }
 
+int ov::CoreImpl::CoreConfig::get_weight_size() const {
+    return _weight_size;
+}
+
+void ov::CoreImpl::CoreConfig::set_weight_size(const int& size) {
+    std::lock_guard<std::mutex> lock(_cacheConfigMutex);
+    _weight_size = size;
+}
+
 // Creating thread-safe copy of config including shared_ptr to ICacheManager
 // Passing empty or not-existing name will return global cache config
 ov::CoreImpl::CoreConfig::CacheConfig ov::CoreImpl::CoreConfig::get_cache_config_for_device(
@@ -1582,7 +1594,24 @@ void ov::CoreImpl::add_mutex(const std::string& dev_name) {
 
 std::shared_ptr<ov::Model> ov::CoreImpl::read_model(const std::string& modelPath, const std::string& binPath) const {
     OV_ITT_SCOPE(FIRST_INFERENCE, ov::itt::domains::ReadTime, "CoreImpl::read_model from file");
-    return ov::util::read_model(modelPath, binPath, extensions, coreConfig.get_enable_mmap());
+    // return ov::util::read_model(modelPath, binPath, extensions, coreConfig.get_enable_mmap());
+    auto ov_model = ov::util::read_model(modelPath, binPath, extensions, coreConfig.get_enable_mmap());
+    // calculate the total weight size (in MB) of all Constant nodes
+    float total_weight_size = 0;
+    for (auto& op : ov_model->get_ordered_ops()) {
+        if (auto constop = std::dynamic_pointer_cast<ov::op::v0::Constant>(op)) {
+            auto weight = static_cast<float>(constop->get_byte_size()) / (1024 * 1024);
+            total_weight_size += weight;
+            // std::cout << "my test weight: " << weight << std::endl;
+        }
+    }
+    // my test: fixed number
+    // total_weight_size = 5;
+
+    const_cast<CoreConfig&>(coreConfig).set_weight_size(static_cast<int>(total_weight_size));
+    // std::cout << "my test Total weight: " << total_weight_size << std::endl;
+
+    return ov_model;
 }
 
 std::shared_ptr<ov::Model> ov::CoreImpl::read_model(const std::string& model,
diff --git a/src/inference/src/dev/core_impl.hpp b/src/inference/src/dev/core_impl.hpp
index 40f2a15bb725e0..fc85df1e00c3d4 100644
--- a/src/inference/src/dev/core_impl.hpp
+++ b/src/inference/src/dev/core_impl.hpp
@@ -83,6 +83,9 @@ class CoreImpl : public ov::ICore, public std::enable_shared_from_this<ov::ICore> {
         bool get_enable_mmap() const;
 
+        int get_weight_size() const;
+        void set_weight_size(const int& size);
+
         std::map<std::string, CacheConfig> _cacheConfigPerDevice;
         bool _flag_enable_mmap = true;
+        int _weight_size = 0;
     };
 
     struct CacheContent {
diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp
index b1a57cedde5a22..85a3b09da15590 100644
--- a/src/plugins/intel_cpu/src/config.cpp
+++ b/src/plugins/intel_cpu/src/config.cpp
@@ -370,6 +370,8 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
                                ov::hint::kv_cache_precision.name(),
                                ". Supported values: u8, bf16, f16, f32");
             }
+        } else if (key == "weights_size") {
+            weightSize = val.as<int>();
         } else {
             OPENVINO_THROW("NotFound: Unsupported property ", key, " by CPU plugin.");
         }
diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h
index 7580d0bb61baf9..bfe38ea603d359 100644
--- a/src/plugins/intel_cpu/src/config.h
+++ b/src/plugins/intel_cpu/src/config.h
@@ -100,6 +100,7 @@ struct Config {
     int modelPreferThreads = -1;
     ModelType modelType = ModelType::Unknown;
+    int weightSize = 0;
 
 #ifdef CPU_DEBUG_CAPS
     DebugCapsConfig debugCaps;
diff --git a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp
index 08ba43f9eff106..6ce5b3a9e1babe 100644
--- a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp
+++ b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp
@@ -235,7 +235,12 @@ std::vector<std::vector<int>> get_streams_info_table(const int input_streams,
         } else if (hint_model_distribution_policy.size() == 0) {
             for (auto& row : proc_socket_table) {
                 if (row[PROC_SOCKET_ID] == current_socket_id) {
-                    n_threads_per_stream = std::max(n_threads_per_stream, row[ALL_PROC]);
+                    // n_threads_per_stream = std::max(n_threads_per_stream, row[ALL_PROC]);
+                    // my test: add update here?
+                    n_threads_per_stream =
+                        model_prefer_threads > 0
+                            ? std::min(std::max(n_threads_per_stream, row[ALL_PROC]), model_prefer_threads)
+                            : std::max(n_threads_per_stream, row[ALL_PROC]);
                 }
             }
         } else {
@@ -468,6 +473,7 @@ int get_model_prefer_threads(const int num_streams,
                              Config& config) {
     const int sockets = get_num_sockets();
     auto model_prefer = 0;
+    bool isNewXeon = false;
     if (-1 == config.modelPreferThreads) {
         const auto isa = dnnl::get_effective_cpu_isa();
         float isaSpecificThreshold = 1.0f;
@@ -484,7 +490,9 @@ int get_model_prefer_threads(const int num_streams,
             isaSpecificThreshold = 2.0f;
             break;
         case dnnl::cpu_isa::avx512_core_amx:
+        case dnnl::cpu_isa::avx512_core_amx_fp16:
             isaSpecificThreshold = 4.0f;
+            isNewXeon = true;
             break;
         default:
             isaSpecificThreshold = 1.0f;
@@ -552,6 +560,7 @@ int get_model_prefer_threads(const int num_streams,
 
     // latency
     if (num_streams <= sockets && num_streams > 0) {
+        bool llm_related = has_matmul_with_compressed_weights(model);
         if (proc_type_table[0][EFFICIENT_CORE_PROC] > 0 && proc_type_table[0][MAIN_CORE_PROC] > 0) {
 #ifdef __APPLE__
             if ((proc_type_table.size() == 1) && (proc_type_table[0][EFFICIENT_CORE_PROC] > 0)) {
@@ -560,7 +569,6 @@ int get_model_prefer_threads(const int num_streams,
                                    : proc_type_table[0][ALL_PROC];
             }
 #else
-            bool llm_related = has_matmul_with_compressed_weights(model);
             bool int8_intensive = ov::op::util::has_op_with_type<ov::op::v0::FakeQuantize>(model) || llm_related;
             const int int8_threshold = 4;  // ~relative efficiency of the VNNI-intensive code for Big vs Little cores;
             const int fp32_threshold = 2;  // ~relative efficiency of the AVX2 fp32 code for Big vs Little cores;
@@ -574,6 +582,17 @@ int get_model_prefer_threads(const int num_streams,
                                       : proc_type_table[0][MAIN_CORE_PROC])
                                : proc_type_table[0][MAIN_CORE_PROC] + proc_type_table[0][EFFICIENT_CORE_PROC];
 #endif
+        } else if (isNewXeon && !llm_related && proc_type_table.size() > 1) {
+            // // my test
+            // model_prefer = 32;
+            // TODO: the config.weightSize threshold needs to be updated
+            if (config.weightSize <= 100) {
+                model_prefer = (proc_type_table[1][MAIN_CORE_PROC] > 32) ? 32 : proc_type_table[1][MAIN_CORE_PROC];
+            } else if ((proc_type_table.size() > 3) &&
+                       (proc_type_table[1][PROC_SOCKET_ID] == proc_type_table[2][PROC_SOCKET_ID]) &&
+                       (proc_type_table[1][MAIN_CORE_PROC] + proc_type_table[2][MAIN_CORE_PROC] >= 64)) {
+                model_prefer = proc_type_table[1][MAIN_CORE_PROC] + proc_type_table[2][MAIN_CORE_PROC];
+            }
         }
     } else {  // throughput
         model_prefer = config.modelPreferThreads;
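
Note (not part of the patch): the standalone sketch below only illustrates the heuristic's input path. It repeats the weight-size computation that the patched CoreImpl::read_model() performs (summing the byte size of all Constant nodes, converted to MB) and shows how such a value could reach the CPU plugin through the "weights_size" property this patch introduces. The property exists only on a build carrying this patch (a stock OpenVINO build rejects it as an unsupported property), and "model.xml" is a placeholder path, so treat the snippet purely as an illustration.

    #include <iostream>
    #include <memory>

    #include "openvino/openvino.hpp"
    #include "openvino/op/constant.hpp"

    int main() {
        ov::Core core;
        // Read the model. With this patch applied, CoreImpl::read_model() already
        // records the weight size internally; the loop below just mirrors that logic.
        std::shared_ptr<ov::Model> model = core.read_model("model.xml");

        // Sum the byte size of every Constant node, converted to megabytes.
        float total_weight_size = 0.0f;
        for (const auto& op : model->get_ordered_ops()) {
            if (auto constop = std::dynamic_pointer_cast<ov::op::v0::Constant>(op)) {
                total_weight_size += static_cast<float>(constop->get_byte_size()) / (1024 * 1024);
            }
        }
        std::cout << "Total weight size: " << total_weight_size << " MB" << std::endl;

        // Hand the value to the CPU plugin via the patch-specific property, so that
        // get_model_prefer_threads() can derive a core count from config.weightSize.
        ov::AnyMap device_config;
        device_config["weights_size"] = static_cast<int>(total_weight_size);
        auto compiled_model = core.compile_model(model, "CPU", device_config);

        return 0;
    }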