Skip to content

Commit

Permalink
Simplify calling context management, include timepoint tracking
Browse files Browse the repository at this point in the history
-> rename ref stuff

All the stuff just being reference numbers in OTF2 underneath is such
an unimportant implementation detail. Rename it to "LocalCctx" and
"GlobalCctx". This way it becomes clear that we first generate local
calling contexts in every sample::Writer and then map them to the global
calling contexts later on in post processing.

-> simplify calling context tracking

Most of it was needlessly complex: in most cases we don't need to record
the context, we simply have it.

-> use Mmap2 Record format instead of Mmap

According to the perf_event_open man page, the sample_id fields (such as
the event timestamp) cannot be added to RecordMmap events, only to
RecordMmap2 events, so change to the latest and greatest format.

-> introduce exec and mmap event timestamp tracking

Keep track of execs and when memory mappings happen by time.

Currently, lo2s does process tracking solely by pid. This approach
fails in the context of execs, where instruction pointer 0xc0ffee can
refer to one thing before the exec and to something different altogether
after the exec.

We now can differentiate between different processes sharing the same
pid over time.

However, one important part is still missing, time tracking for
instruction pointers. This is programmatically the hard part.

But for now this at least makes the common case (fork() then immediate
exec()) work better.

-> simplify cctx tracking

Track Thread CCTX (used for calling context enter/leave events)
separately from Instruction Pointer CCTX (used for calling context
sample events). Remove a layer of indirection. Track instruction
pointers per process instead of per thread. (Threads cannot have
different memory images.)
  • Loading branch information
cvonelm committed Nov 7, 2024
1 parent a622be3 commit e0101d5
Show file tree
Hide file tree
Showing 11 changed files with 248 additions and 217 deletions.
8 changes: 4 additions & 4 deletions include/lo2s/monitor/main_monitor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
#ifdef HAVE_SENSORS
#include <lo2s/metric/sensors/recorder.hpp>
#endif
#include <lo2s/mmap.hpp>
#include <lo2s/monitor/io_monitor.hpp>
#ifdef HAVE_VEOSINFO
#include <lo2s/monitor/nec_monitor_main.hpp>
Expand Down Expand Up @@ -61,16 +60,17 @@ class MainMonitor
return trace_;
}

void insert_cached_mmap_events(const RawMemoryMapCache& cached_events);
void insert_cached_events(const RawMemoryMapCache& cached_events,
const RawCommCache& cached_comms);

std::map<Process, ProcessInfo>& get_process_infos()
ProcessMap& get_process_infos()
{
return process_infos_;
}

protected:
trace::Trace trace_;
std::map<Process, ProcessInfo> process_infos_;
ProcessMap process_infos_;
metric::plugin::Metrics metrics_;
std::vector<std::unique_ptr<TracepointMonitor>> tracepoint_monitors_;

Expand Down
137 changes: 84 additions & 53 deletions include/lo2s/perf/calling_context_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,62 +20,45 @@
*/
#pragma once

#include <lo2s/config.hpp>
#include <lo2s/trace/trace.hpp>
#include <lo2s/address.hpp>
#include <lo2s/log.hpp>

#include <otf2xx/otf2.hpp>

namespace lo2s
extern "C"
{
namespace perf
#include <linux/perf_event.h>
}

namespace lo2s
{

class CallingContextManager
struct LocalCctx
{
public:
CallingContextManager(trace::Trace& trace) : local_cctx_refs_(trace.create_cctx_refs())
LocalCctx(otf2::definition::calling_context::reference_type r) : ref(r)
{
}

void thread_enter(Process process, Thread thread)
{
auto ret =
local_cctx_refs_.map.emplace(std::piecewise_construct, std::forward_as_tuple(thread),
std::forward_as_tuple(process, next_cctx_ref_));
if (ret.second)
{
next_cctx_ref_++;
}
otf2::definition::calling_context::reference_type ref;
std::map<Address, LocalCctx> children;
};

current_thread_cctx_refs_ = &(*ret.first);
class LocalCctxMap
{
public:
LocalCctxMap()
{
}

void finalize(otf2::writer::local* otf2_writer)
{
local_cctx_refs_.ref_count = next_cctx_ref_;
ref_count_ = next_cctx_ref_;
// set writer last, because it is used as sentry to confirm that the cctx refs are properly
// finalized.
local_cctx_refs_.writer = otf2_writer;
}

bool thread_changed(Thread thread)
{
return !current_thread_cctx_refs_ || current_thread_cctx_refs_->first != thread;
}

otf2::definition::calling_context::reference_type current()
{
if (current_thread_cctx_refs_)
{
return current_thread_cctx_refs_->second.entry.ref;
}
else
{
return otf2::definition::calling_context::reference_type::undefined();
}
writer_ = otf2_writer;
}

otf2::definition::calling_context::reference_type sample_ref(uint64_t num_ips,
otf2::definition::calling_context::reference_type sample_ref(Process p, uint64_t num_ips,
const uint64_t ips[])
{
// For unwind distance definiton, see:
Expand All @@ -96,40 +79,80 @@ class CallingContextManager
// information.
//
// Having these things in mind, look at this line and tell me, why it is still wrong:
auto children = &current_thread_cctx_refs_->second.entry.children;
auto children = &map[p];
uint64_t ref = -1;
for (uint64_t i = num_ips - 1;; i--)
{
if (ips[i] == PERF_CONTEXT_KERNEL)
{
if (i <= 1)
{
return ref;
}
continue;
}
else if (ips[i] == PERF_CONTEXT_USER)
{
if (i <= 1)
{
return ref;
}
continue;
}
auto it = find_ip_child(ips[i], *children);
// We intentionally discard the last sample as it is somewhere in the kernel
if (i == 1)
ref = it->second.ref;
if (i == 0)
{
return it->second.ref;
return ref;
}

children = &it->second.children;
}
}

otf2::definition::calling_context::reference_type sample_ref(uint64_t ip)
otf2::definition::calling_context::reference_type sample_ref(Process p, uint64_t ip)
{
auto it = find_ip_child(ip, current_thread_cctx_refs_->second.entry.children);
auto it = find_ip_child(ip, map[p]);

return it->second.ref;
}

void thread_leave(Thread thread)
otf2::definition::calling_context::reference_type thread(Process process, Thread thread)
{
assert(current_thread_cctx_refs_);
if (current_thread_cctx_refs_->first != thread)
auto ret =
thread_cctxs_[process].emplace(std::piecewise_construct, std::forward_as_tuple(thread),
std::forward_as_tuple(next_cctx_ref_));
if (ret.second)
{
Log::debug() << "inconsistent leave thread"; // will probably set to trace sooner or
// later
next_cctx_ref_++;
}
current_thread_cctx_refs_ = nullptr;

return ret.first->second.ref;
}

// Returns the stored calling-context count (ref_count_).
// ref_count_ is assigned from next_cctx_ref_ in finalize().
size_t num_cctx() const
{
return ref_count_;
}

// Read-only access to the per-process map of Thread -> LocalCctx entries.
// Entries are inserted by thread(Process, Thread).
const std::map<Process, std::map<Thread, LocalCctx>>& get_threads() const
{
return thread_cctxs_;
}

// Read-only access to the per-process map of Address -> LocalCctx entries.
// This is the map that the sample_ref() overloads index by process.
const std::map<Process, std::map<Address, LocalCctx>>& get_functions() const
{
return map;
}

// Returns the OTF2 local writer pointer stored in writer_.
// writer_ is initialised to nullptr and assigned in finalize().
otf2::writer::local* writer()
{
return writer_;
}

private:
trace::IpRefMap::iterator find_ip_child(Address addr, trace::IpRefMap& children)
std::map<Address, LocalCctx>::iterator find_ip_child(Address addr,
std::map<Address, LocalCctx>& children)
{
// -1 can't be inserted into the ip map, as it imples a 1-byte region from -1 to 0.
if (addr == -1)
Expand All @@ -147,9 +170,17 @@ class CallingContextManager
}

private:
trace::ThreadCctxRefMap& local_cctx_refs_;
std::map<Process, std::map<Address, LocalCctx>> map;
std::map<Process, std::map<Thread, LocalCctx>> thread_cctxs_;

/*
* Stores calling context information for each sample writer / monitoring thread.
* While the `Trace` always owns this data, the `sample::Writer` should have exclusive access to
* this data during its lifetime. Only afterwards, the `writer` and `refcount` are set by the
* `sample::Writer`.
*/
std::atomic<otf2::writer::local*> writer_ = nullptr;
std::atomic<size_t> ref_count_;
size_t next_cctx_ref_ = 0;
trace::ThreadCctxRefMap::value_type* current_thread_cctx_refs_ = nullptr;
};
} // namespace perf
} // namespace lo2s
1 change: 1 addition & 0 deletions include/lo2s/perf/sample/reader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ class Reader : public EventReader<T>
perf_attr.config1 = sampling_event.config1;

perf_attr.mmap = 1;
perf_attr.mmap2 = 1;
}
else
{
Expand Down
10 changes: 6 additions & 4 deletions include/lo2s/perf/sample/writer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
#pragma once

#include <lo2s/address.hpp>
#include <lo2s/mmap.hpp>
#include <lo2s/perf/calling_context_manager.hpp>

#include <lo2s/perf/sample/reader.hpp>
Expand Down Expand Up @@ -64,7 +63,7 @@ class Writer : public Reader<Writer>
public:
using Reader<Writer>::handle;
bool handle(const Reader::RecordSampleType* sample);
bool handle(const Reader::RecordMmapType* mmap_event);
bool handle(const Reader::RecordMmap2Type* mmap_event);
bool handle(const Reader::RecordCommType* comm);
bool handle(const Reader::RecordSwitchCpuWideType* context_switch);
bool handle(const Reader::RecordSwitchType* context_switch);
Expand All @@ -76,7 +75,7 @@ class Writer : public Reader<Writer>
void update_calling_context(Process process, Thread thread, otf2::chrono::time_point tp,
bool switch_out);

void leave_current_thread(Thread thread, otf2::chrono::time_point tp);
void leave_current_thread(Process process, Thread thread, otf2::chrono::time_point tp);
otf2::chrono::time_point adjust_timepoints(otf2::chrono::time_point tp);

ExecutionScope scope_;
Expand All @@ -89,15 +88,18 @@ class Writer : public Reader<Writer>
otf2::definition::metric_instance cpuid_metric_instance_;
otf2::event::metric cpuid_metric_event_;

CallingContextManager cctx_manager_;
LocalCctxMap& local_cctx_map_;
RawMemoryMapCache cached_mmap_events_;
RawCommCache cached_comm_events_;
std::unordered_map<Thread, std::string> comms_;

const time::Converter time_converter_;

bool first_event_ = true;
otf2::chrono::time_point first_time_point_;
otf2::chrono::time_point last_time_point_;
Process cur_process_ = Process::invalid();
Thread cur_thread_ = Thread::invalid();
};
} // namespace sample
} // namespace perf
Expand Down
4 changes: 2 additions & 2 deletions include/lo2s/perf/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ class PerfEventCache
PerfEventCache(const PerfEventCache&) = delete;
PerfEventCache& operator=(const PerfEventCache&) = delete;

PerfEventCache(const T* event, size_t size) : data_(std::make_unique<std::byte[]>(size))
PerfEventCache(const T* event) : data_(std::make_unique<std::byte[]>(event->header.size))
{
memcpy(data_.get(), event, size);
memcpy(data_.get(), event, event->header.size);
}

T* get()
Expand Down
Loading

0 comments on commit e0101d5

Please sign in to comment.