Skip to content

Commit

Permalink
Add chrome trace
Browse files Browse the repository at this point in the history
  • Loading branch information
usstq authored and dmitry-gorokhov committed Sep 3, 2024
1 parent f7ade08 commit f7c5d3c
Show file tree
Hide file tree
Showing 4 changed files with 262 additions and 1 deletion.
10 changes: 9 additions & 1 deletion src/plugins/intel_cpu/src/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include "utils/general_utils.h"
#include "utils/ngraph_utils.hpp"
#include "utils/node_dumper.h"
#include "utils/profiler.hpp"
#include "utils/verbose.h"
#include "utils/precision_support.h"

Expand Down Expand Up @@ -1014,6 +1015,7 @@ bool Graph::ProcessDynNodes() {
}

void Graph::PushInputData(const std::size_t& index, const ov::SoPtr<ITensor>& input) {
PROFILE(_prof, "Graph::PushInputData");
if (!IsReady()) OPENVINO_THROW("Wrong state. Topology not ready.");
auto input_itr = inputNodesMap.find(index);
if (input_itr != inputNodesMap.end()) {
Expand Down Expand Up @@ -1046,6 +1048,7 @@ void Graph::PushInputData(const std::size_t& index, const ov::SoPtr<ITensor>& in

// suppose always being shared infer_request intel_cpu::Tensor to Graph if isDynamic.
void Graph::PullOutputData(std::unordered_map<std::size_t, ov::SoPtr<ITensor>>& output) {
PROFILE(_prof, "Graph::PullOutputData");
if (!IsReady())
OPENVINO_THROW("Wrong state. Topology not ready.");

Expand Down Expand Up @@ -1335,6 +1338,7 @@ class UpdateNodes : public UpdateNodesBase {
/* group all the profiling macros into a single one
* to avoid cluttering a core logic */
#define VERBOSE_PERF_DUMP_ITT_DEBUG_LOG(ittScope, node, config) \
PROFILE(_prof, node->getTypeStr(), node->getName()); \
VERBOSE(node, config.debugCaps.verbose, infer_count); \
PERF(node, config.collectPerfCounters, perfKey); \
DUMP(node, config.debugCaps, nestingLevel, infer_count); \
Expand All @@ -1360,11 +1364,15 @@ inline void Graph::ExecuteNodeWithCatch(const NodePtr& node, SyncInferRequest* r

template<typename UpdateStrategy>
void Graph::InferDynamic(SyncInferRequest* request, int numaId, UpdateStrategy&& update) {
PROFILE(_prof0, std::string("Graph::InferDynamic_#") + std::to_string(infer_count));
CPU_DEBUG_CAP_ENABLE(const PerfKey perfKey = perfGetKey(*this));

size_t inferCounter = 0;
for (auto stopIndx : m_executableSyncNodesInds) {
update(stopIndx);
{
PROFILE(_prof, "updateNodes");
update(stopIndx);
}

for (; inferCounter < stopIndx; ++inferCounter) {
auto& node = m_executableGraphNodes[inferCounter];
Expand Down
5 changes: 5 additions & 0 deletions src/plugins/intel_cpu/src/node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
#include <dnnl_debug.h>
#include "utils/general_utils.h"
#include "utils/cpu_utils.hpp"
#include "utils/verbose.h"
#include "utils/profiler.hpp"
#include "nodes/common/cpu_convert.h"
#include "memory_desc/cpu_memory_desc_utils.h"
#include "memory_desc/dnnl_blocked_memory_desc.h"
Expand Down Expand Up @@ -555,12 +557,14 @@ void Node::updateShapes() {
IShapeInfer::Result result = {{}, ShapeInferStatus::skip};
{
PERF_SHAPE_INFER(this);
PROFILE(_prof, "updateShape", getName());
if (needShapeInfer()) {
result = shapeInfer();
}
}
{
PERF_PREDEFINE_OUTPUT_MEMORY(this);
PROFILE(_prof, "redefineOutputMemory", getName());
if (ShapeInferStatus::success == result.status) {
redefineOutputMemory(result.dims);
}
Expand All @@ -578,6 +582,7 @@ void Node::updateDynamicParams() {
getName());
try {
PERF_PREPARE_PARAMS(this);
PROFILE(_prof, "updateDynamicParams", getName());
if (isExecutable()) {
if (needPrepareParams()) {
OPENVINO_ASSERT(inputShapesDefined(),
Expand Down
165 changes: 165 additions & 0 deletions src/plugins/intel_cpu/src/utils/profiler.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "profiler.hpp"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <fstream>
#include <iostream>
#include <map>
#include <numeric>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#ifdef OV_CPU_WITH_PROFILER
namespace ov {
namespace intel_cpu {

uint64_t tsc_ticks_per_second;
uint64_t tsc_ticks_base;
inline uint64_t tsc_to_usec(uint64_t tsc_ticks) {
return (tsc_ticks - tsc_ticks_base) * 1000000 / tsc_ticks_per_second;
}

void init_tsc() {
static std::once_flag flag;
std::call_once(flag, []() {
uint64_t start_ticks = __rdtsc();
std::this_thread::sleep_for(std::chrono::seconds(1));
tsc_ticks_per_second = (__rdtsc() - start_ticks);
std::cout << "[OV_CPU_PROFILE] tsc_ticks_per_second = " << tsc_ticks_per_second << std::endl;
tsc_ticks_base = __rdtsc();
});
}

// to minimize the overhead of profiler when it's not being enabled,
// the inst is not put inside a singleton function to save extra
// cost in multi-threading safety checks.

struct chromeTrace {
std::ostream& os;
int fake_tid;
uint64_t ts;
chromeTrace(std::ostream& os, int fake_tid) : os(os), fake_tid(fake_tid) {}
void addCompleteEvent(std::string name,
std::string cat,
uint64_t start,
uint64_t dur) {
// chrome tracing will show & group-by to name, so we use cat as name
os << "{\"ph\": \"X\", \"name\": \"" << cat << "\", \"cat\":\"" << name << "\","
<< "\"pid\": " << fake_tid << ", \"tid\": 0,"
<< "\"ts\": " << start << ", \"dur\": " << dur << "},\n";
}
};

thread_local ProfilerManager profilerManagerInstance;

static std::atomic_int totalProfilerManagers{0};

void ProfileData::record_end(ProfileData* p) {
p->end = __rdtsc();
}

bool not_finalized = true;

struct ProfilerManagerFinalizer {
std::mutex g_mutex;
std::set<ProfilerManager*> all_managers;
const char* dump_file_name = "ov_profile.json";
bool dump_file_over = false;

~ProfilerManagerFinalizer() {
if (not_finalized)
finalize();
}

void finalize() {
if (!not_finalized)
return;

std::lock_guard<std::mutex> guard(g_mutex);
if (dump_file_over || all_managers.empty())
return;

// start dump
std::ofstream fw;
fw.open(dump_file_name, std::ios::out);
fw << "{\n";
fw << "\"schemaVersion\": 1,\n";
fw << "\"traceEvents\": [\n";
fw.flush();

for (auto& pthis : all_managers) {
if (!pthis->enabled)
continue;
auto data_size = pthis->all_data.size();
if (!data_size)
continue;

// open output file
chromeTrace ct(fw, pthis->serial);
for (auto& d : pthis->all_data) {
ct.addCompleteEvent(d.name,
d.cat,
tsc_to_usec(d.start),
tsc_to_usec(d.end) - tsc_to_usec(d.start));
}
pthis->all_data.clear();
std::cout << "[OV_CPU_PROFILE] #" << pthis->serial << "(" << pthis << ") finalize: dumpped "
<< data_size << std::endl;
}
all_managers.clear();

fw << R"({
"name": "Profiler End",
"ph": "i",
"s": "g",
"pid": "Traces",
"tid": "Trace OV Profiler",
"ts":)"
<< tsc_to_usec(__rdtsc()) << "}",
fw << "]\n";
fw << "}\n";
auto total_size = fw.tellp();
fw.close();
dump_file_over = true;
not_finalized = false;
std::cout << "[OV_CPU_PROFILE] Dumpped " << total_size / (1024 * 1024) << " (MB) to " << dump_file_name
<< std::endl;
}

void register_manager(ProfilerManager* pthis) {
std::lock_guard<std::mutex> guard(g_mutex);
std::stringstream ss;
ss << "[OV_CPU_PROFILE] #" << pthis->serial << "(" << this << ") : is registed." << std::endl;
std::cout << ss.str();
all_managers.emplace(pthis);
}
} finalizer;

ProfilerManager::ProfilerManager() {
const char* str_enable = std::getenv("OV_CPU_PROFILE");
if (!str_enable)
str_enable = "0";
enabled = atoi(str_enable) > 0;
if (enabled) {
init_tsc();
tid = std::this_thread::get_id();
serial = totalProfilerManagers.fetch_add(1);
finalizer.register_manager(this);
}
}

ProfilerManager::~ProfilerManager() {
if (not_finalized)
finalizer.finalize();
}

} // namespace intel_cpu
} // namespace ov
#endif
83 changes: 83 additions & 0 deletions src/plugins/intel_cpu/src/utils/profiler.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once
#define OV_CPU_WITH_PROFILER
#ifdef OV_CPU_WITH_PROFILER

#include <atomic>
#include <chrono>
#include <cstddef>
#include <deque>
#include <fstream>
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>

extern "C" {
#ifdef _WIN32
# include <intrin.h>
#else
# include <x86intrin.h>
#endif
}

namespace ov {
namespace intel_cpu {

struct ProfileData {
uint64_t start;
uint64_t end;
std::string cat;
std::string name;

ProfileData(const std::string& cat, const std::string& name) : cat(cat), name(name) {
start = __rdtsc();
}
static void record_end(ProfileData* p);
};

typedef std::unique_ptr<ProfileData, void (*)(ProfileData*)> ProfileDataHandle;

class chromeTrace;
struct ProfilerManagerFinalizer;

class ProfilerManager {
bool enabled;
std::deque<ProfileData> all_data;
std::thread::id tid;
int serial;

public:
ProfilerManager();
~ProfilerManager();

ProfileDataHandle startProfile(const std::string& cat, const std::string& name = {}) {
if (!enabled) {
return ProfileDataHandle(nullptr, [](ProfileData*) {});
}
all_data.emplace_back(cat, name);
return ProfileDataHandle(&all_data.back(), ProfileData::record_end);
}

friend class ProfilerManagerFinalizer;
};

extern thread_local ProfilerManager profilerManagerInstance;

} // namespace intel_cpu
} // namespace ov

# define PROFILE(var_name, ...) \
auto var_name = ov::intel_cpu::profilerManagerInstance.startProfile(__VA_ARGS__); \
(void)var_name;
#else

# define PROFILE(var_name, ...)

#endif

0 comments on commit f7c5d3c

Please sign in to comment.