diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5deb99c7..62448002 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,6 +34,7 @@ IfUpdatedUnsetAll(lo2s_USE_STATIC_LIBS
Libpfm_USE_STATIC_LIBS
X86Adapt_STATIC
x86_energy_STATIC
+ CUDA_USE_STATIC_LIBS
)
if(lo2s_USE_STATIC_LIBS STREQUAL "OFF")
@@ -45,6 +46,7 @@ if(lo2s_USE_STATIC_LIBS STREQUAL "OFF")
set(x86_energy_STATIC OFF CACHE BOOL "")
set(Sensors_USE_STATIC_LIBS OFF CACHE BOOL "")
set(Libpfm_USE_STATIC_LIBS OFF CACHE BOOL "")
+ set(CUDA_USE_STATIC_LIBS OFF CACHE BOOL "")
endif()
if(lo2s_USE_STATIC_LIBS STREQUAL "MOSTLY")
@@ -56,6 +58,7 @@ if(lo2s_USE_STATIC_LIBS STREQUAL "MOSTLY")
set(x86_energy_STATIC ON CACHE BOOL "")
set(Sensors_USE_STATIC_LIBS ON CACHE BOOL "")
set(Libpfm_USE_STATIC_LIBS ON CACHE BOOL "")
+ set(CUDA_USE_STATIC_LIBS ON CACHE BOOL "")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libstdc++ -static-libgcc")
endif()
@@ -68,6 +71,7 @@ if(lo2s_USE_STATIC_LIBS STREQUAL "ALL")
set(x86_energy_STATIC ON CACHE BOOL "")
set(Sensors_USE_STATIC_LIBS ON CACHE BOOL "")
set(Libpfm_USE_STATIC_LIBS ON CACHE BOOL "")
+ set(CUDA_USE_STATIC_LIBS ON CACHE BOOL "")
# Doesn't seem to work with clang, even though it should,
# but at least it doesn't complain about it either
@@ -107,6 +111,7 @@ find_package(Sensors)
find_package(Veosinfo)
find_package(Libpfm)
find_package(PkgConfig)
+find_package(CUDAToolkit)
if(PkgConfig_FOUND)
pkg_check_modules(Audit audit)
@@ -129,6 +134,8 @@ CMAKE_DEPENDENT_OPTION(USE_LIBAUDIT "Use libaudit for syscall name resolution" O
add_feature_info("USE_LIBAUDIT" USE_LIBAUDIT "Use libaudit for syscall name resolution.")
CMAKE_DEPENDENT_OPTION(USE_VEOSINFO "Use libveosinfo to sample NEC SX-Aurora Tsubasa cards." ON "Veosinfo_FOUND" OFF)
add_feature_info("USE_VEOSINFO" USE_VEOSINFO "Use libveosinfo to sample NEC SX-Aurora Tsubasa cards.")
+CMAKE_DEPENDENT_OPTION(USE_CUPTI "Use CUPTI to record CUDA activity." ON "CUDAToolkit_FOUND" OFF)
+add_feature_info("USE_CUPTI" USE_CUPTI "Use CUPTI to record CUDA activity.")
# system configuration checks
CHECK_INCLUDE_FILES(linux/hw_breakpoint.h HAVE_HW_BREAKPOINT_H)
CHECK_STRUCT_HAS_MEMBER("struct perf_event_attr" clockid linux/perf_event.h HAVE_PERF_EVENT_ATTR_CLOCKID)
@@ -144,6 +151,13 @@ if(NOT CLOCK_GETTIME_FOUND)
unset(CMAKE_REQUIRED_LIBRARIES)
endif()
+check_function_exists(shm_open SHM_OPEN_FOUND)
+if(NOT SHM_OPEN_FOUND)
+ set(CMAKE_REQUIRED_LIBRARIES "rt")
+ check_function_exists(shm_open SHM_OPEN_FOUND_WITH_RT)
+ unset(CMAKE_REQUIRED_LIBRARIES)
+endif()
+
CHECK_STRUCT_HAS_BITFIELD("struct perf_event_attr" context_switch linux/perf_event.h HAVE_PERF_RECORD_SWITCH)
if(NOT HAVE_PERF_RECORD_SWITCH)
@@ -226,6 +240,14 @@ if(NOT CLOCK_GETTIME_FOUND)
endif()
endif()
+if(NOT SHM_OPEN_FOUND)
+ if(SHM_OPEN_FOUND_WITH_RT)
+ target_link_libraries(lo2s PRIVATE rt)
+ else()
+ message(SEND_ERROR "Could not find the function shm_open(), but it is required.")
+ endif()
+endif()
+
# handle x86_adapt dependency
if(X86Adapt_FOUND)
target_sources(lo2s PRIVATE
@@ -306,6 +328,31 @@ if (USE_LIBAUDIT)
endif()
endif()
+set(LO2S_CUDA_INJECTIONLIB_PATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/liblo2s_injection.so")
+if(USE_CUPTI)
+ if(CUDAToolkit_FOUND)
+ add_library(lo2s_injection SHARED src/cupti/lib.cpp)
+ target_include_directories(lo2s_injection PRIVATE include
+ ${CMAKE_CURRENT_BINARY_DIR}/include)
+
+ if (CUDA_USE_STATIC_LIBS)
+ target_link_libraries(lo2s_injection PRIVATE CUDA::cupti_static)
+ else()
+ target_link_libraries(lo2s_injection PRIVATE CUDA::cupti)
+ endif()
+
+ if(SHM_OPEN_FOUND_WITH_RT)
+ target_link_libraries(lo2s_injection PRIVATE rt)
+ endif()
+
+ target_compile_definitions(lo2s PUBLIC HAVE_CUDA)
+ install(TARGETS lo2s_injection LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
+ else()
+ message(SEND_ERROR "Cupti not found but requested.")
+ endif()
+endif()
+
+
# generate version string used in lo2s
if(Git_FOUND)
diff --git a/include/lo2s/build_config.hpp.in b/include/lo2s/build_config.hpp.in
index a1363958..a89d446b 100644
--- a/include/lo2s/build_config.hpp.in
+++ b/include/lo2s/build_config.hpp.in
@@ -34,3 +34,7 @@
#cmakedefine LO2S_COPYRIGHT_YEAR "@LO2S_COPYRIGHT_YEAR@"
+
+// The CUDA injection library installation path
+
+#cmakedefine LO2S_CUDA_INJECTIONLIB_PATH "@LO2S_CUDA_INJECTIONLIB_PATH@"
diff --git a/include/lo2s/config.hpp b/include/lo2s/config.hpp
index 76427bf8..53fd6f36 100644
--- a/include/lo2s/config.hpp
+++ b/include/lo2s/config.hpp
@@ -98,6 +98,10 @@ struct Config
bool use_nec;
std::chrono::microseconds nec_read_interval;
std::chrono::milliseconds nec_check_interval;
+ // Nvidia CUPTI
+ bool use_nvidia;
+ std::string cuda_injectionlib_path;
+ uint64_t nvidia_ringbuf_size;
};
const Config& config();
diff --git a/include/lo2s/cupti/events.hpp b/include/lo2s/cupti/events.hpp
new file mode 100644
index 00000000..ba994458
--- /dev/null
+++ b/include/lo2s/cupti/events.hpp
@@ -0,0 +1,50 @@
+/*
+ * This file is part of the lo2s software.
+ * Linux OTF2 sampling
+ *
+ * Copyright (c) 2024,
+ * Technische Universitaet Dresden, Germany
+ *
+ * lo2s is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * lo2s is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with lo2s. If not, see .
+ */
+
+#pragma once
+
+#include
+
+namespace lo2s
+{
+namespace cupti
+{
+enum class EventType : uint64_t
+{
+ CUPTI_KERNEL = 1,
+};
+
+struct event_header
+{
+ EventType type;
+ uint64_t size;
+};
+
+struct event_kernel
+{
+ struct event_header header;
+ uint64_t start;
+ uint64_t end;
+ char name[1];
+};
+
+} // namespace cupti
+} // namespace lo2s
diff --git a/include/lo2s/cupti/reader.hpp b/include/lo2s/cupti/reader.hpp
new file mode 100644
index 00000000..5c474b67
--- /dev/null
+++ b/include/lo2s/cupti/reader.hpp
@@ -0,0 +1,98 @@
+/*
+ * This file is part of the lo2s software.
+ * Linux OTF2 sampling
+ *
+ * Copyright (c) 2016,
+ * Technische Universitaet Dresden, Germany
+ *
+ * lo2s is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * lo2s is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with lo2s. If not, see .
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+extern "C"
+{
+#include
+#include
+}
+
+namespace lo2s
+{
+namespace cupti
+{
+
+class Reader
+{
+public:
+ Reader(trace::Trace& trace, Process process)
+ : process_(process), trace_(trace), time_converter_(perf::time::Converter::instance()),
+ ringbuf_reader_("cupti", process.as_pid_t(), true, config().nvidia_ringbuf_size),
+ timer_fd_(timerfd_from_ns(config().userspace_read_interval)),
+ executable_name_(get_process_exe(process))
+ {
+ }
+
+ void read()
+ {
+ struct event_header* header = nullptr;
+
+ while ((header = reinterpret_cast(
+ ringbuf_reader_.get(sizeof(struct event_header)))) != nullptr)
+ {
+ if (header->type == EventType::CUPTI_KERNEL)
+ {
+ struct event_kernel* kernel =
+ reinterpret_cast(ringbuf_reader_.get(header->size));
+
+ auto& writer = trace_.cuda_writer(Thread(process_.as_thread()));
+
+ std::string kernel_name = kernel->name;
+ auto& cu_cctx = trace_.cuda_calling_context(executable_name_, kernel_name);
+
+ writer.write_calling_context_enter(time_converter_(kernel->start), cu_cctx.ref(),
+ 2);
+ writer.write_calling_context_leave(time_converter_(kernel->end), cu_cctx.ref());
+ }
+
+ ringbuf_reader_.pop(header->size);
+ }
+ }
+
+ int fd()
+ {
+ return timer_fd_;
+ }
+
+private:
+ Process process_;
+ trace::Trace& trace_;
+ perf::time::Converter& time_converter_;
+ RingBufReader ringbuf_reader_;
+ int timer_fd_;
+ std::string executable_name_;
+};
+} // namespace cupti
+} // namespace lo2s
diff --git a/include/lo2s/measurement_scope.hpp b/include/lo2s/measurement_scope.hpp
index bc3a2979..f3841815 100644
--- a/include/lo2s/measurement_scope.hpp
+++ b/include/lo2s/measurement_scope.hpp
@@ -33,6 +33,7 @@ enum class MeasurementScopeType
NEC_METRIC,
BIO,
SYSCALL,
+ CUDA,
UNKNOWN
};
@@ -79,6 +80,11 @@ struct MeasurementScope
return { MeasurementScopeType::SYSCALL, s };
}
+ static MeasurementScope cuda(ExecutionScope s)
+ {
+ return { MeasurementScopeType::CUDA, s };
+ }
+
friend bool operator==(const MeasurementScope& lhs, const MeasurementScope& rhs)
{
return (lhs.scope == rhs.scope) && lhs.type == rhs.type;
@@ -111,6 +117,8 @@ struct MeasurementScope
return fmt::format("block layer I/O events for {}", scope.name());
case MeasurementScopeType::SYSCALL:
return fmt::format("syscall events for {}", scope.name());
+ case lo2s::MeasurementScopeType::CUDA:
+ return fmt::format("cuda kernel events for {}", scope.name());
default:
throw new std::runtime_error("Unknown ExecutionScopeType!");
}
diff --git a/include/lo2s/monitor/abstract_process_monitor.hpp b/include/lo2s/monitor/abstract_process_monitor.hpp
index bcc93a79..18bb6859 100644
--- a/include/lo2s/monitor/abstract_process_monitor.hpp
+++ b/include/lo2s/monitor/abstract_process_monitor.hpp
@@ -41,7 +41,7 @@ class AbstractProcessMonitor
virtual void insert_process(Process parent, Process process, std::string proc_name,
bool spawn = false) = 0;
virtual void insert_thread(Process process, Thread thread, std::string name = "",
- bool spawn = false) = 0;
+ bool spawn = false, bool is_process = false) = 0;
virtual void exit_thread(Thread thread) = 0;
diff --git a/include/lo2s/monitor/process_monitor.hpp b/include/lo2s/monitor/process_monitor.hpp
index d5463c63..609f5d42 100644
--- a/include/lo2s/monitor/process_monitor.hpp
+++ b/include/lo2s/monitor/process_monitor.hpp
@@ -45,7 +45,8 @@ class ProcessMonitor : public AbstractProcessMonitor, public MainMonitor
~ProcessMonitor();
void insert_process(Process parent, Process child, std::string proc_name,
bool spawn = false) override;
- void insert_thread(Process parent, Thread child, std::string name, bool spawn = false) override;
+ void insert_thread(Process parent, Thread child, std::string name, bool spawn = false,
+ bool is_process = false) override;
void exit_thread(Thread thread) override;
diff --git a/include/lo2s/monitor/scope_monitor.hpp b/include/lo2s/monitor/scope_monitor.hpp
index f68cd6e7..c809d64c 100644
--- a/include/lo2s/monitor/scope_monitor.hpp
+++ b/include/lo2s/monitor/scope_monitor.hpp
@@ -24,14 +24,15 @@
#include
#include
+#include
#include
#include
-
#include
#include
#include
#include
+#include
#include
#include
@@ -50,7 +51,8 @@ namespace monitor
class ScopeMonitor : public PollMonitor
{
public:
- ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enable_on_exec);
+ ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enable_on_exec,
+ bool is_process = false);
void initialize_thread() override;
void finalize_thread() override;
@@ -74,6 +76,7 @@ class ScopeMonitor : public PollMonitor
std::unique_ptr sample_writer_;
std::unique_ptr group_counter_writer_;
std::unique_ptr userspace_counter_writer_;
+ std::unique_ptr cupti_reader_;
};
} // namespace monitor
} // namespace lo2s
diff --git a/include/lo2s/monitor/system_process_monitor.hpp b/include/lo2s/monitor/system_process_monitor.hpp
index 3162ecde..ed785718 100644
--- a/include/lo2s/monitor/system_process_monitor.hpp
+++ b/include/lo2s/monitor/system_process_monitor.hpp
@@ -46,8 +46,8 @@ class SystemProcessMonitor : public AbstractProcessMonitor
virtual void insert_process(Process parent, Process process, std::string proc_name,
bool spawn) override;
- virtual void insert_thread(Process process, Thread thread, std::string name,
- bool spawn) override;
+ virtual void insert_thread(Process process, Thread thread, std::string name, bool spawn,
+ bool is_process) override;
virtual void exit_thread(Thread thread) override;
diff --git a/include/lo2s/ringbuf.hpp b/include/lo2s/ringbuf.hpp
new file mode 100644
index 00000000..6d30e4d4
--- /dev/null
+++ b/include/lo2s/ringbuf.hpp
@@ -0,0 +1,222 @@
+/*
+ * This file is part of the lo2s software.
+ * Linux OTF2 sampling
+ *
+ * Copyright (c) 2024,
+ * Technische Universitaet Dresden, Germany
+ *
+ * lo2s is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * lo2s is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with lo2s. If not, see .
+ */
+
+#pragma once
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+extern "C"
+{
+#include
+#include
+#include
+#include
+}
+
+namespace lo2s
+{
+
+// To resolve possible ringbuf format incompatibilities
+#define RINGBUF_VERSION 1
+
+struct ringbuf_header
+{
+ uint64_t version;
+ uint64_t size;
+ std::atomic_uint64_t head;
+ std::atomic_uint64_t tail;
+};
+
+class ShmRingbuf
+{
+public:
+ ShmRingbuf(std::string component, pid_t pid, bool create, size_t pages)
+ {
+ std::string filename = "/lo2s-" + component + "-" + std::to_string(pid);
+
+ fd_ = shm_open(filename.c_str(), create ? O_RDWR | O_CREAT | O_EXCL : O_RDWR, 0600);
+ if (fd_ == -1)
+ {
+ throw std::system_error(errno, std::system_category());
+ }
+
+ size_t pagesize = sysconf(_SC_PAGESIZE);
+ size_t ringbuf_size;
+
+ if (create)
+ {
+ ringbuf_size = pagesize * pages;
+ ftruncate(fd_, ringbuf_size + sysconf(_SC_PAGESIZE));
+ }
+ else
+ {
+ auto header_map = SharedMemory(fd_, sizeof(struct ringbuf_header), 0);
+ ringbuf_size = header_map.as()->size;
+ }
+
+ // To handle events that wrap around the ringbuffer, map it twice into virtual memory
+ // back-to-back. This way events that wrap around the ringbuffer can be read and written
+ // without noticing the wraparound:
+ //
+ // in physical memory: [ent|-----|ev]
+ //
+ // in virtual memory: [ent|-----|ev][ent----|ev]
+ //
+ // As there is no way to reserve a range of virtual memory, mmap()-ing two adjacent
+ // ring-buffer without races is tricky. We solve this problem by mmap()-ing an area twice
+ // the size of the ringbuffer and then overwriting the latter half of this mapping with
+ // another mapping of the ringbuffer using MMAP_FIXED. This way we only touch mappings we
+ // control. Also, put the ringbuffer header on a separate page to make life easier.
+
+ first_mapping_ = SharedMemory(fd_, ringbuf_size * 2 + pagesize, 0);
+
+ second_mapping_ = SharedMemory(fd_, ringbuf_size, pagesize,
+ first_mapping_.as() + ringbuf_size);
+
+ header_ = first_mapping_.as();
+ start_ = first_mapping_.as() + pagesize;
+
+ if (create)
+ {
+ header_->version = RINGBUF_VERSION;
+ header_->size = ringbuf_size;
+ header_->tail.store(0);
+ header_->head.store(0);
+ }
+ }
+
+ uint64_t head()
+ {
+ return header_->head.load();
+ }
+
+ uint64_t tail()
+ {
+ return header_->tail.load();
+ }
+
+ void head(uint64_t new_head)
+ {
+ return header_->head.store(new_head);
+ }
+
+ void tail(uint64_t new_tail)
+ {
+ return header_->tail.store(new_tail);
+ }
+
+ uint64_t ringbuf_size()
+ {
+ return header_->size;
+ }
+
+protected:
+ std::byte* start_;
+
+private:
+ struct ringbuf_header* header_;
+ int fd_;
+ SharedMemory first_mapping_, second_mapping_;
+};
+
+class RingBufWriter : public ShmRingbuf
+{
+public:
+ RingBufWriter(std::string component, pid_t pid, bool create, size_t pages = 0)
+ : ShmRingbuf(component, pid, create, pages)
+ {
+ }
+
+ std::byte* reserve(size_t size)
+ {
+ // No other reservation can be active!
+ assert(reserved_size_ == 0);
+
+ if (head() >= tail() && size >= tail() - head() + ringbuf_size())
+ {
+ return nullptr;
+ }
+ if (head() < tail() && size >= tail() - head())
+ {
+ return nullptr;
+ }
+
+ reserved_size_ = size;
+ return start_ + head();
+ }
+
+ void commit()
+ {
+ assert(reserved_size_ != 0);
+
+ head((head() + reserved_size_) % ringbuf_size());
+ reserved_size_ = 0;
+ }
+
+private:
+ size_t reserved_size_ = 0;
+};
+
+class RingBufReader : public ShmRingbuf
+{
+public:
+ RingBufReader(std::string component, pid_t pid, bool create, size_t pages = 0)
+ : ShmRingbuf(component, pid, create, pages)
+ {
+ }
+
+ std::byte* get(size_t size)
+ {
+ if (!can_be_loaded(size))
+ {
+ return nullptr;
+ }
+ return start_ + tail();
+ }
+
+ void pop(size_t size)
+ {
+ // Calling pop() without trying to get() data from the ringbuffer first is a error
+ assert(can_be_loaded(size));
+
+ tail((tail() + size) % ringbuf_size());
+ }
+
+private:
+ bool can_be_loaded(size_t size)
+ {
+ if (tail() <= head())
+ {
+ return tail() + size <= head();
+ }
+
+ return tail() + size <= head() + ringbuf_size();
+ }
+};
+} // namespace lo2s
diff --git a/include/lo2s/trace/reg_keys.hpp b/include/lo2s/trace/reg_keys.hpp
index ae0ba9d2..542f8945 100644
--- a/include/lo2s/trace/reg_keys.hpp
+++ b/include/lo2s/trace/reg_keys.hpp
@@ -231,8 +231,8 @@ struct Holder
template <>
struct Holder
{
- using type =
- otf2::lookup_definition_holder;
+ using type = otf2::lookup_definition_holder;
};
template <>
diff --git a/include/lo2s/trace/trace.hpp b/include/lo2s/trace/trace.hpp
index ad8e71ce..9075292e 100644
--- a/include/lo2s/trace/trace.hpp
+++ b/include/lo2s/trace/trace.hpp
@@ -19,6 +19,7 @@
* along with lo2s. If not, see .
*/
#pragma once
+#include "otf2xx/definition/calling_context.hpp"
#include
#include
#include
@@ -131,12 +132,16 @@ class Trace
otf2::definition::mapping_table merge_syscall_contexts(const std::set& used_syscalls);
otf2::writer::local& sample_writer(const ExecutionScope& scope);
+ otf2::writer::local& cuda_writer(const Thread& thread);
otf2::writer::local& metric_writer(const MeasurementScope& scope);
otf2::writer::local& syscall_writer(const Cpu& cpu);
otf2::writer::local& bio_writer(BlockDevice dev);
otf2::writer::local& create_metric_writer(const std::string& name);
otf2::writer::local& nec_writer(NecDevice device, const Thread& nec_thread);
+ otf2::definition::calling_context& cuda_calling_context(std::string& exe,
+ std::string& function);
+
otf2::definition::io_handle& block_io_handle(BlockDevice dev);
otf2::definition::metric_member
diff --git a/include/lo2s/util.hpp b/include/lo2s/util.hpp
index 2d85569f..efd3fca3 100644
--- a/include/lo2s/util.hpp
+++ b/include/lo2s/util.hpp
@@ -122,4 +122,6 @@ Thread gettid();
std::set parse_list(std::string list);
std::set parse_list_from_file(std::filesystem::path file);
+
+int timerfd_from_ns(std::chrono::nanoseconds duration);
} // namespace lo2s
diff --git a/man/lo2s.1.pod b/man/lo2s.1.pod
index 8ffdfbc6..50b819fb 100644
--- a/man/lo2s.1.pod
+++ b/man/lo2s.1.pod
@@ -27,6 +27,7 @@ S<[B<--metric-count> I | B<--metric-frequency> I]>
S<[B<-x> I]>
S<[B<-X>]>
S<[B<-s SYSCALL>]>
+S<[B<--accel ACCEL>]>
S<{ I | I }>
=item I := { I | B<--> I [I...] | B<-p> I }
@@ -381,6 +382,24 @@ Record measurements for each sensor found by L.
=back
+=head2 B options
+
+=over
+
+=item B<--accel> I
+
+Record activity events (instruction samples or kernel execution information) for the given accelerator. Usable accelerators are "nec" for NEC SX-Aurora and "nvidia" for NVidia CUDA accelerators.
+
+=item B<--nec-readout-interval> I
+
+Set the interval (in microseconds) between NEC SX-Aurora instruction samples.
+
+=item B<--nec-check-interval> I
+
+Set the interval (in milliseconds) between checks for new NEC SX-Aurora processes.
+
+=back
+
=head2 Arguments to options
=over
diff --git a/src/config.cpp b/src/config.cpp
index d4399d6d..55505876 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -164,7 +164,7 @@ void parse_program_options(int argc, const char** argv)
auto& x86_energy_options = parser.group("x86_energy options");
auto& sensors_options = parser.group("sensors options");
auto& io_options = parser.group("I/O recording options");
- auto& nec_options = parser.group("NEC SX-Aurora Tsubasa recording options");
+ auto& accel_options = parser.group("Accelerator options");
lo2s::Config config;
@@ -346,16 +346,42 @@ void parse_program_options(int argc, const char** argv)
io_options.toggle("block-io",
"Enable recording of block I/O events (requires access to debugfs)");
- nec_options.toggle("nec", "Enable NEC Vector Engine sampling");
- nec_options.option("nec-readout-interval", "NEC sampling interval")
+ std::vector accelerators;
+
+#ifdef HAVE_CUDA
+ accelerators.push_back("nvidia");
+#endif
+#ifdef HAVE_VEOSINFO
+ accelerators.push_back("nec");
+#endif
+
+ accel_options
+ .multi_option(
+ "accel",
+ fmt::format("Accelerator to record execution events for. Available accelerators: {}",
+ fmt::join(accelerators, ", ")))
+ .metavar("ACCEL")
+ .optional();
+
+ accel_options.option("nec-readout-interval", "Accelerator sampling interval")
.optional()
.metavar("USEC")
.default_value("1");
- nec_options.option("nec-check-interval", "The interval between checks for new VE processes")
+ accel_options.option("nec-check-interval", "The interval between checks for new VE processes")
.optional()
.metavar("MSEC")
.default_value("100");
+ accel_options.option("nvidia-injection-path", "path to the lo2s cupti injection library")
+ .optional()
+ .metavar("PATH")
+ .default_value(LO2S_CUDA_INJECTIONLIB_PATH);
+
+ accel_options.option("nvidia-ringbuf-size", "Size of the injection library ring-buffer")
+ .optional()
+ .metavar("BYTE")
+ .default_value("65536");
+
nitro::options::arguments arguments;
try
{
@@ -382,7 +408,10 @@ void parse_program_options(int argc, const char** argv)
config.use_x86_energy = arguments.given("x86-energy");
config.use_sensors = arguments.given("sensors");
config.use_block_io = arguments.given("block-io");
- config.use_nec = arguments.given("nec");
+
+#ifdef HAVE_CUDA
+ config.cuda_injectionlib_path = arguments.get("nvidia-injection-path");
+#endif
config.command = arguments.positionals();
if (arguments.given("help"))
@@ -514,6 +543,35 @@ void parse_program_options(int argc, const char** argv)
}
}
+ for (const auto& accel : arguments.get_all("accel"))
+ {
+ if (accel == "nec")
+ {
+#ifdef HAVE_VEOSINFO
+ config.use_nec = true;
+#else
+ std::cerr << "lo2s was built without support for NEC SX-Aurora sampling\n";
+ std::exit(EXIT_FAILURE);
+#endif
+ }
+ else if (accel == "nvidia")
+ {
+#ifdef HAVE_CUDA
+ config.use_nvidia = true;
+ config.nvidia_ringbuf_size = arguments.as("nvidia-ringbuf-size");
+#else
+ std::cerr << "lo2s was built without support for CUDA kernel recording\n";
+ std::exit(EXIT_FAILURE);
+#endif
+ }
+ else
+ {
+ std::cerr << "Unknown Accelerator " << accel << "!";
+ parser.usage();
+ std::exit(EXIT_FAILURE);
+ }
+ }
+
std::vector perf_group_events = arguments.get_all("metric-event");
std::vector perf_userspace_events = arguments.get_all("userspace-metric-event");
diff --git a/src/cupti/lib.cpp b/src/cupti/lib.cpp
new file mode 100644
index 00000000..86cc4a69
--- /dev/null
+++ b/src/cupti/lib.cpp
@@ -0,0 +1,202 @@
+/*
+ * This file is part of the lo2s software.
+ * Linux OTF2 sampling
+ *
+ * Copyright (c) 2016,
+ * Technische Universitaet Dresden, Germany
+ *
+ * lo2s is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * lo2s is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with lo2s. If not, see .
+ */
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+extern "C"
+{
+#include
+#include
+#include
+}
+
+// Allocate 8 MiB every time CUPTI asks for more event memory
+constexpr size_t CUPTI_BUFFER_SIZE = 8 * 1024 * 1024;
+
+std::unique_ptr rb_writer = nullptr;
+CUpti_SubscriberHandle subscriber = nullptr;
+
+clockid_t clockid = CLOCK_MONOTONIC_RAW;
+
+static void atExitHandler(void)
+{
+ // Flush all remaining activity records
+ cuptiActivityFlushAll(1);
+}
+
+static void CUPTIAPI bufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords)
+{
+ assert(buffer != nullptr && size != nullptr && maxNumRecords != nullptr);
+
+ *maxNumRecords = 0;
+ *size = CUPTI_BUFFER_SIZE;
+ *buffer = static_cast(malloc(*size));
+
+ if (*buffer == nullptr)
+ {
+ std::cerr << "Error: Out of memory.\n";
+ exit(-1);
+ }
+}
+
+static void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t* buffer, size_t size,
+ size_t validSize)
+{
+ CUpti_Activity* record = nullptr;
+ while (cuptiActivityGetNextRecord(buffer, validSize, &record) == CUPTI_SUCCESS)
+ {
+ switch (record->kind)
+ {
+ case CUPTI_ACTIVITY_KIND_KERNEL:
+ case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL:
+ {
+ CUpti_ActivityKernel6* kernel = reinterpret_cast(record);
+
+ uint64_t name_len = strlen(kernel->name);
+
+ struct lo2s::cupti::event_kernel* ev =
+ reinterpret_cast(
+ rb_writer->reserve(sizeof(struct lo2s::cupti::event_kernel) + name_len));
+
+ if (ev == nullptr)
+ {
+ std::cerr
+ << "Ringbuffer full, dropping event. Try to increase --nvidia-ringbuf-size!"
+ << std::endl;
+ continue;
+ }
+
+ ev->header.type = lo2s::cupti::EventType::CUPTI_KERNEL;
+ ev->header.size = sizeof(struct lo2s::cupti::event_kernel) + name_len;
+ ev->start = kernel->start;
+ ev->end = kernel->end;
+ memcpy(ev->name, kernel->name, name_len + 1);
+
+ rb_writer->commit();
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+ size_t dropped;
+ cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped);
+ if (dropped != 0)
+ {
+ std::cerr << "Dropped " << dropped << " activity records.\n";
+ }
+
+ free(buffer);
+}
+
+void CUPTIAPI callbackHandler(void* userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cbid,
+ void* cbdata)
+{
+ const CUpti_CallbackData* cbInfo = (CUpti_CallbackData*)cbdata;
+
+ if (domain == CUPTI_CB_DOMAIN_DRIVER_API)
+ {
+ if (cbid == CUPTI_DRIVER_TRACE_CBID_cuProfilerStart)
+ {
+ if (cbInfo->callbackSite == CUPTI_API_EXIT)
+ {
+ cuptiActivityEnableContext(cbInfo->context, CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL);
+ }
+ }
+ else if (cbid == CUPTI_DRIVER_TRACE_CBID_cuProfilerStop)
+ {
+ if (cbInfo->callbackSite == CUPTI_API_ENTER)
+ {
+ cuptiActivityFlushAll(0);
+ cuptiEnableCallback(0, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API,
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceReset_v3020);
+
+ cuptiActivityDisableContext(cbInfo->context, CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL);
+ }
+ }
+ }
+
+ // Also flush on CUDA device reset
+ else if (domain == CUPTI_CB_DOMAIN_RUNTIME_API)
+ {
+ if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaDeviceReset_v3020)
+ {
+ if (cbInfo->callbackSite == CUPTI_API_ENTER)
+ {
+ cuptiActivityFlushAll(0);
+ }
+ }
+ }
+}
+
+uint64_t timestampfunc()
+{
+ struct timespec ts;
+ clock_gettime(clockid, &ts);
+ uint64_t res = ts.tv_sec * 1000000000 + ts.tv_nsec;
+ return res;
+}
+
+extern "C" int InitializeInjection(void)
+{
+
+ std::string rb_size_str;
+ rb_writer = std::make_unique("cupti", getpid(), false);
+ char* clockid_str = getenv("LO2S_CLOCKID");
+
+ if (clockid_str != nullptr)
+ {
+ clockid = std::stoi(clockid_str);
+ }
+
+ // Register an atexit() handler for clean-up
+ atexit(&atExitHandler);
+
+ cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)callbackHandler, nullptr);
+
+ // Supply or own timestamp generation function. Save us the work of converting timestamps
+ cuptiActivityRegisterTimestampCallback(timestampfunc);
+
+ // Register CUDA API callbacks for us to attach to new CUDA contexts
+ cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
+ CUPTI_DRIVER_TRACE_CBID_cuProfilerStart);
+ cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
+ CUPTI_DRIVER_TRACE_CBID_cuProfilerStop);
+
+ cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL);
+
+ // Register buffer callbacks. When cupti needs a new buffer for recording date, it calls
+ // bufferRequested. When the buffer is full, bufferCompleted is used to write the data to the
+ // lo2s ring-buffer
+ cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted);
+
+ return 1;
+}
diff --git a/src/monitor/poll_monitor.cpp b/src/monitor/poll_monitor.cpp
index d19effa5..03d0a6e4 100644
--- a/src/monitor/poll_monitor.cpp
+++ b/src/monitor/poll_monitor.cpp
@@ -22,12 +22,9 @@
#include
#include
#include
+#include
#include
-extern "C"
-{
-#include
-}
namespace lo2s
{
@@ -51,18 +48,9 @@ PollMonitor::PollMonitor(trace::Trace& trace, const std::string& name,
if (read_interval.count() != 0)
{
- tspec.it_value.tv_nsec = 1;
-
- tspec.it_interval.tv_sec =
- std::chrono::duration_cast(read_interval).count();
-
- tspec.it_interval.tv_nsec = (read_interval % std::chrono::seconds(1)).count();
-
- timer_pfd().fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
+ timer_pfd().fd = timerfd_from_ns(read_interval);
timer_pfd().events = POLLIN;
timer_pfd().revents = 0;
-
- timerfd_settime(timer_pfd().fd, TFD_TIMER_ABSTIME, &tspec, NULL);
}
else
{
diff --git a/src/monitor/process_monitor.cpp b/src/monitor/process_monitor.cpp
index 3b7cbeee..f4d66047 100644
--- a/src/monitor/process_monitor.cpp
+++ b/src/monitor/process_monitor.cpp
@@ -38,10 +38,11 @@ void ProcessMonitor::insert_process(Process parent, Process process, std::string
bool spawn)
{
trace_.add_process(parent, process, proc_name);
- insert_thread(process, process.as_thread(), proc_name, spawn);
+ insert_thread(process, process.as_thread(), proc_name, spawn, true);
}
-void ProcessMonitor::insert_thread(Process process, Thread thread, std::string name, bool spawn)
+void ProcessMonitor::insert_thread(Process process, Thread thread, std::string name, bool spawn,
+ bool is_process)
{
trace_.add_thread(thread, name);
@@ -56,9 +57,9 @@ void ProcessMonitor::insert_thread(Process process, Thread thread, std::string n
{
try
{
- auto inserted =
- threads_.emplace(std::piecewise_construct, std::forward_as_tuple(thread),
- std::forward_as_tuple(ExecutionScope(thread), *this, spawn));
+ auto inserted = threads_.emplace(
+ std::piecewise_construct, std::forward_as_tuple(thread),
+ std::forward_as_tuple(ExecutionScope(thread), *this, spawn, is_process));
assert(inserted.second);
// actually start thread
inserted.first->second.start();
diff --git a/src/monitor/process_monitor_main.cpp b/src/monitor/process_monitor_main.cpp
index aa4a4819..536265b5 100644
--- a/src/monitor/process_monitor_main.cpp
+++ b/src/monitor/process_monitor_main.cpp
@@ -19,6 +19,7 @@
* along with lo2s. If not, see .
*/
+#include
#include
#include
@@ -26,6 +27,7 @@
#include
#include
+#include
#include
#include
#include
@@ -117,6 +119,21 @@ static void drop_privileges()
assert(getgid() != 0);
}
+std::vector to_vector_of_c_str(std::vector vec)
+{
+ std::vector res;
+ std::transform(vec.begin(), vec.end(), std::back_inserter(res),
+ [](const std::string& s)
+ {
+ char* pc = new char[s.size() + 1];
+ std::strcpy(pc, s.c_str());
+ return pc;
+ });
+ res.push_back(nullptr);
+
+ return res;
+}
+
[[noreturn]] static void run_command(const std::vector& command_and_args)
{
struct rlimit initial_rlimit = initial_rlimit_fd();
@@ -137,14 +154,24 @@ static void drop_privileges()
/* we need ptrace to get fork/clone/... */
ptrace(PTRACE_TRACEME, 0, NULL, NULL);
- std::vector tmp;
- std::transform(command_and_args.begin(), command_and_args.end(), std::back_inserter(tmp),
- [](const std::string& s) {
- char* pc = new char[s.size() + 1];
- std::strcpy(pc, s.c_str());
- return pc;
- });
- tmp.push_back(nullptr);
+ std::vector env;
+#ifdef HAVE_CUDA
+ if (config().use_nvidia)
+ {
+ env = { "CUDA_INJECTION64_PATH=" + config().cuda_injectionlib_path };
+
+ if (config().use_clockid)
+ {
+ env.push_back("LO2S_CLOCKID=" + std::to_string(config().clockid));
+ }
+ else
+ {
+ env.push_back("LO2S_CLOCKID=" + std::to_string(CLOCK_MONOTONIC_RAW));
+ }
+ }
+#endif
+ std::vector c_env = to_vector_of_c_str(env);
+ std::vector c_args = to_vector_of_c_str(command_and_args);
Log::debug() << "Execute the command: " << nitro::lang::join(command_and_args);
@@ -158,13 +185,19 @@ static void drop_privileges()
}
// run the application which should be sampled
- execvp(tmp[0], &tmp[0]);
+ execvpe(c_args[0], &c_args[0], &c_env[0]);
// should not be executed -> exec failed, let's clean up anyway.
- for (auto cp : tmp)
+ for (auto cp : c_args)
+ {
+ delete[] cp;
+ }
+
+ for (auto cp : c_env)
{
delete[] cp;
}
+
Log::error() << "Could not execute the command: " << nitro::lang::join(command_and_args);
throw_errno();
}
diff --git a/src/monitor/scope_monitor.cpp b/src/monitor/scope_monitor.cpp
index c97c3d1b..af9f6af9 100644
--- a/src/monitor/scope_monitor.cpp
+++ b/src/monitor/scope_monitor.cpp
@@ -28,9 +28,6 @@
#include
#include
-#include
-
-#include
extern "C"
{
@@ -42,7 +39,8 @@ namespace lo2s
namespace monitor
{
-ScopeMonitor::ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enable_on_exec)
+ScopeMonitor::ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enable_on_exec,
+ bool is_process)
: PollMonitor(parent.trace(), scope.name(), config().perf_read_interval), scope_(scope)
{
if (config().sampling || scope.is_cpu())
@@ -72,6 +70,13 @@ ScopeMonitor::ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enabl
add_fd(userspace_counter_writer_->fd());
}
+ if (config().use_nvidia && is_process)
+ {
+ cupti_reader_ =
+ std::make_unique(parent.trace(), scope.as_thread().as_process());
+ add_fd(cupti_reader_->fd());
+ }
+
// note: start() can now be called
}
@@ -95,6 +100,11 @@ void ScopeMonitor::monitor(int fd)
try_pin_to_scope(scope_);
}
+ if (cupti_reader_ && (fd == cupti_reader_->fd() || fd == stop_pfd().fd))
+ {
+ cupti_reader_->read();
+ }
+
if (syscall_writer_ &&
(fd == timer_pfd().fd || fd == stop_pfd().fd || syscall_writer_->fd() == fd))
{
diff --git a/src/monitor/system_process_monitor.cpp b/src/monitor/system_process_monitor.cpp
index b87001c5..a86d07d6 100644
--- a/src/monitor/system_process_monitor.cpp
+++ b/src/monitor/system_process_monitor.cpp
@@ -34,7 +34,8 @@ void SystemProcessMonitor::insert_process([[maybe_unused]] Process parent,
}
void SystemProcessMonitor::insert_thread([[maybe_unused]] Process process, Thread thread,
- std::string name, [[maybe_unused]] bool spawn)
+ std::string name, [[maybe_unused]] bool spawn,
+ [[maybe_unused]] bool is_process)
{
// in system monitoring, we only need to track the threads spawned from the process lo2s spawned
// itself. Without this, these threads end up as "". Sad times.
diff --git a/src/perf/counter/userspace/reader.cpp b/src/perf/counter/userspace/reader.cpp
index 32b82f95..985397e3 100644
--- a/src/perf/counter/userspace/reader.cpp
+++ b/src/perf/counter/userspace/reader.cpp
@@ -31,7 +31,6 @@
extern "C"
{
-#include
#include
}
@@ -43,26 +42,14 @@ namespace counter
{
namespace userspace
{
-
template
Reader::Reader(ExecutionScope scope)
: counter_collection_(
CounterProvider::instance().collection_for(MeasurementScope::userspace_metric(scope))),
- counter_buffer_(counter_collection_.counters.size()), data_(counter_collection_.counters.size())
+ counter_buffer_(counter_collection_.counters.size()),
+ timer_fd_(timerfd_from_ns(config().userspace_read_interval)),
+ data_(counter_collection_.counters.size())
{
- struct itimerspec tspec;
- memset(&tspec, 0, sizeof(struct itimerspec));
- tspec.it_value.tv_nsec = 1;
-
- tspec.it_interval.tv_sec =
- std::chrono::duration_cast(config().userspace_read_interval).count();
-
- tspec.it_interval.tv_nsec =
- (config().userspace_read_interval % std::chrono::seconds(1)).count();
- timer_fd_ = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
-
- timerfd_settime(timer_fd_, TFD_TIMER_ABSTIME, &tspec, NULL);
-
for (auto& event : counter_collection_.counters)
{
counter_fds_.emplace_back(perf_event_description_open(scope, event, -1));
diff --git a/src/trace/trace.cpp b/src/trace/trace.cpp
index 46df1980..6e4b8dbc 100644
--- a/src/trace/trace.cpp
+++ b/src/trace/trace.cpp
@@ -19,7 +19,6 @@
* along with lo2s. If not, see .
*/
-#include "otf2xx/chrono/duration.hpp"
#include
#include
@@ -411,6 +410,21 @@ otf2::writer::local& Trace::sample_writer(const ExecutionScope& writer_scope)
return archive_(location(writer_scope));
}
+otf2::writer::local& Trace::cuda_writer(const Thread& thread)
+{
+ MeasurementScope scope = MeasurementScope::cuda(thread.as_scope());
+
+ const auto& cuda_location_group = registry_.emplace(
+ ByMeasurementScope(scope), intern(scope.name()), otf2::common::location_group_type::process,
+ system_tree_root_node_);
+
+ const auto& intern_location = registry_.emplace(
+ ByMeasurementScope(scope), intern(scope.name()), cuda_location_group,
+ otf2::definition::location::location_type::cpu_thread);
+
+ return archive_(intern_location);
+}
+
otf2::writer::local& Trace::nec_writer(NecDevice device, const Thread& nec_thread)
{
@@ -489,6 +503,15 @@ otf2::writer::local& Trace::create_metric_writer(const std::string& name)
return archive_(location);
}
+otf2::definition::calling_context& Trace::cuda_calling_context(std::string& file,
+ std::string& function)
+{
+ LineInfo info = LineInfo::for_function(file.c_str(), function.c_str(), 0, "");
+
+ return registry_.emplace(
+ ByLineInfo(info), intern_region(info), intern_scl(info));
+}
+
otf2::definition::io_handle& Trace::block_io_handle(BlockDevice dev)
{
diff --git a/src/util.cpp b/src/util.cpp
index ff4fc632..be3ac43b 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -27,6 +27,7 @@ extern "C"
#include
#include
#include
+#include
#include
#include
#include
@@ -411,4 +412,19 @@ void bump_rlimit_fd()
"resource limit.";
}
}
+
+int timerfd_from_ns(std::chrono::nanoseconds duration)
+{
+ int timerfd;
+ struct itimerspec tspec;
+ memset(&tspec, 0, sizeof(struct itimerspec));
+
+ tspec.it_interval.tv_sec = std::chrono::duration_cast(duration).count();
+ tspec.it_interval.tv_nsec = (duration % std::chrono::seconds(1)).count();
+
+ timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
+
+ timerfd_settime(timerfd, TFD_TIMER_ABSTIME, &tspec, NULL);
+ return timerfd;
+}
} // namespace lo2s