diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index 4f3bf381d20720..183ed6a6d6ac49 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -39,6 +39,7 @@ #include "utils/node_dumper.h" #include "utils/verbose.h" #include "utils/precision_support.h" +#include "utils/profiler.hpp" #include #include "common/primitive_desc_iface.hpp" @@ -1030,6 +1031,7 @@ bool Graph::ProcessDynNodes() { } void Graph::PushInputData(const std::size_t& index, const ov::SoPtr& input) { + PROFILE(_prof, "Graph::PushInputData"); if (!IsReady()) OPENVINO_THROW("Wrong state. Topology not ready."); auto input_itr = inputNodesMap.find(index); if (input_itr != inputNodesMap.end()) { @@ -1062,6 +1064,7 @@ void Graph::PushInputData(const std::size_t& index, const ov::SoPtr& in // suppose always being shared infer_request intel_cpu::Tensor to Graph if isDynamic. void Graph::PullOutputData(std::unordered_map>& output) { + PROFILE(_prof, "Graph::PullOutputData"); if (!IsReady()) OPENVINO_THROW("Wrong state. Topology not ready."); @@ -1362,6 +1365,7 @@ void Graph::InferDynamic(SyncInferRequest* request, UpdateStrategy&& update) { VERBOSE(node, getConfig().debugCaps.verbose); PERF(node, getConfig().collectPerfCounters); + PROFILE(_prof, node->getTypeStr(), node->getName()); if (request) request->throw_if_canceled(); try { @@ -1474,7 +1478,8 @@ void Graph::Infer(SyncInferRequest* request) { OPENVINO_ASSERT(IsReady(), "Wrong state of the ov::intel_cpu::Graph. Topology is not ready: ", static_cast(status)); } - if (infer_count != -1) infer_count++; + // if (infer_count != -1) infer_count++; + infer_count++; } void Graph::SortTopologically() { diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index 326b3e907dcc8f..78249a33569647 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -35,6 +35,7 @@ #include "memory_desc/dnnl_blocked_memory_desc.h" #include #include +#include "utils/profiler.hpp" using namespace dnnl; using namespace openvino; @@ -551,6 +552,7 @@ void Node::updateShapes() { getName()); try { if (needShapeInfer()) { + PROFILE(_prof, "updateShape", getName()); auto result = shapeInfer(); if (ShapeInferStatus::success == result.status) { redefineOutputMemory(result.dims); @@ -568,6 +570,7 @@ void Node::updateDynamicParams() { " with name: ", getName()); try { + PROFILE(_prof, "updateDynamicParams", getName()); if (isExecutable()) { if (needPrepareParams()) { OPENVINO_ASSERT(inputShapesDefined(), diff --git a/src/plugins/intel_cpu/src/utils/profiler.cpp b/src/plugins/intel_cpu/src/utils/profiler.cpp new file mode 100644 index 00000000000000..e15b47c7faff21 --- /dev/null +++ b/src/plugins/intel_cpu/src/utils/profiler.cpp @@ -0,0 +1,165 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "profiler.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef OV_CPU_WITH_PROFILER +namespace ov { +namespace intel_cpu { + +uint64_t tsc_ticks_per_second; +uint64_t tsc_ticks_base; +inline uint64_t tsc_to_usec(uint64_t tsc_ticks) { + return (tsc_ticks - tsc_ticks_base) * 1000000 / tsc_ticks_per_second; +} + +void init_tsc() { + static std::once_flag flag; + std::call_once(flag, []() { + uint64_t start_ticks = __rdtsc(); + std::this_thread::sleep_for(std::chrono::seconds(1)); + tsc_ticks_per_second = (__rdtsc() - start_ticks); + std::cout << "[OV_CPU_PROFILE] tsc_ticks_per_second = " << tsc_ticks_per_second << std::endl; + tsc_ticks_base = __rdtsc(); + }); +} + +// to minimize the overhead of profiler when it's not being enabled, +// the inst is not put inside a singleton function to save extra +// cost in multi-threading safety checks. + +struct chromeTrace { + std::ostream& os; + int fake_tid; + uint64_t ts; + chromeTrace(std::ostream& os, int fake_tid) : os(os), fake_tid(fake_tid) {} + void addCompleteEvent(std::string name, + std::string cat, + uint64_t start, + uint64_t dur) { + // chrome tracing will show & group-by to name, so we use cat as name + os << "{\"ph\": \"X\", \"name\": \"" << cat << "\", \"cat\":\"" << name << "\"," + << "\"pid\": " << fake_tid << ", \"tid\": 0," + << "\"ts\": " << start << ", \"dur\": " << dur << "},\n"; + } +}; + +thread_local ProfilerManager profilerManagerInstance; + +static std::atomic_int totalProfilerManagers{0}; + +void ProfileData::record_end(ProfileData* p) { + p->end = __rdtsc(); +} + +bool not_finalized = true; + +struct ProfilerManagerFinalizer { + std::mutex g_mutex; + std::set all_managers; + const char* dump_file_name = "ov_profile.json"; + bool dump_file_over = false; + + ~ProfilerManagerFinalizer() { + if (not_finalized) + finalize(); + } + + void finalize() { + if (!not_finalized) + return; + + std::lock_guard guard(g_mutex); + if (dump_file_over || all_managers.empty()) + return; + + // start dump + std::ofstream fw; + fw.open(dump_file_name, std::ios::out); + fw << "{\n"; + fw << "\"schemaVersion\": 1,\n"; + fw << "\"traceEvents\": [\n"; + fw.flush(); + + for (auto& pthis : all_managers) { + if (!pthis->enabled) + continue; + auto data_size = pthis->all_data.size(); + if (!data_size) + continue; + + // open output file + chromeTrace ct(fw, pthis->serial); + for (auto& d : pthis->all_data) { + ct.addCompleteEvent(d.name, + d.cat, + tsc_to_usec(d.start), + tsc_to_usec(d.end) - tsc_to_usec(d.start)); + } + pthis->all_data.clear(); + std::cout << "[OV_CPU_PROFILE] #" << pthis->serial << "(" << pthis << ") finalize: dumpped " + << data_size << std::endl; + } + all_managers.clear(); + + fw << R"({ + "name": "Profiler End", + "ph": "i", + "s": "g", + "pid": "Traces", + "tid": "Trace OV Profiler", + "ts":)" + << tsc_to_usec(__rdtsc()) << "}", + fw << "]\n"; + fw << "}\n"; + auto total_size = fw.tellp(); + fw.close(); + dump_file_over = true; + not_finalized = false; + std::cout << "[OV_CPU_PROFILE] Dumpped " << total_size / (1024 * 1024) << " (MB) to " << dump_file_name + << std::endl; + } + + void register_manager(ProfilerManager* pthis) { + std::lock_guard guard(g_mutex); + std::stringstream ss; + ss << "[OV_CPU_PROFILE] #" << pthis->serial << "(" << this << ") : is registed." << std::endl; + std::cout << ss.str(); + all_managers.emplace(pthis); + } +} finalizer; + +ProfilerManager::ProfilerManager() { + const char* str_enable = std::getenv("OV_CPU_PROFILE"); + if (!str_enable) + str_enable = "0"; + enabled = atoi(str_enable) > 0; + if (enabled) { + init_tsc(); + tid = std::this_thread::get_id(); + serial = totalProfilerManagers.fetch_add(1); + finalizer.register_manager(this); + } +} + +ProfilerManager::~ProfilerManager() { + if (not_finalized) + finalizer.finalize(); +} + +} // namespace intel_cpu +} // namespace ov +#endif \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/utils/profiler.hpp b/src/plugins/intel_cpu/src/utils/profiler.hpp new file mode 100644 index 00000000000000..47686b6a74818e --- /dev/null +++ b/src/plugins/intel_cpu/src/utils/profiler.hpp @@ -0,0 +1,83 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#define OV_CPU_WITH_PROFILER +#ifdef OV_CPU_WITH_PROFILER + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern "C" { +#ifdef _WIN32 +# include +#else +# include +#endif +} + +namespace ov { +namespace intel_cpu { + +struct ProfileData { + uint64_t start; + uint64_t end; + std::string cat; + std::string name; + + ProfileData(const std::string& cat, const std::string& name) : cat(cat), name(name) { + start = __rdtsc(); + } + static void record_end(ProfileData* p); +}; + +typedef std::unique_ptr ProfileDataHandle; + +class chromeTrace; +struct ProfilerManagerFinalizer; + +class ProfilerManager { + bool enabled; + std::deque all_data; + std::thread::id tid; + int serial; + +public: + ProfilerManager(); + ~ProfilerManager(); + + ProfileDataHandle startProfile(const std::string& cat, const std::string& name = {}) { + if (!enabled) { + return ProfileDataHandle(nullptr, [](ProfileData*) {}); + } + all_data.emplace_back(cat, name); + return ProfileDataHandle(&all_data.back(), ProfileData::record_end); + } + + friend class ProfilerManagerFinalizer; +}; + +extern thread_local ProfilerManager profilerManagerInstance; + +} // namespace intel_cpu +} // namespace ov + +# define PROFILE(var_name, ...) \ + auto var_name = ov::intel_cpu::profilerManagerInstance.startProfile(__VA_ARGS__); \ + (void)var_name; +#else + +# define PROFILE(var_name, ...) + +#endif \ No newline at end of file