diff --git a/labs/debugging-ebpf/debugging-ebpf.tex b/labs/debugging-ebpf/debugging-ebpf.tex index 7d53cbe006..550cab1d19 100644 --- a/labs/debugging-ebpf/debugging-ebpf.tex +++ b/labs/debugging-ebpf/debugging-ebpf.tex @@ -165,7 +165,13 @@ \section{Improving our program} As a final improvement, we will trace the parent PID as well to know who is starting any program. \begin{itemize} \item Edit your eBPF program to read the parent PID. This info can be captured by retrieving the current \code{struct task_struct}, and identifying the relevant fields. Check both Elixir for the layout of \code{struct task_struct}, and \manpage{bpf-helpers}{7} to learn how to get the current task. - \item We are using CO-RE definition for kernel data (through vmlinux.h), so we can not dereference directly a \code{struct task_struct} in our eBPF program, we must use helpers to retrieve struct fields. You can check \href{https://nakryiko.com/posts/bpf-core-reference-guide/#the-missing-manual}{this blog post from Andrii Nakryiko} to learn about such helpers. + \item We are using CO-RE definition for kernel data (through vmlinux.h), so + we can not dereference directly a \code{struct task_struct} in our eBPF + program, we must use helpers to retrieve struct fields. You can check + \href{https://nakryiko.com/posts/bpf-core-reference-guide/#the-missing-manual}{this + blog post from Andrii Nakryiko} to learn about such helpers. Also, you will + need to check \kstruct{task_struct} to know what field to extract to get the + parent PID. 
\item Update your userspace program to read and print the newly captured value \end{itemize} diff --git a/slides/debugging-system-wide-profiling/bpf_lifecycle.dia b/slides/debugging-system-wide-profiling/bpf_lifecycle.dia new file mode 100644 index 0000000000..61847f9d1f --- /dev/null +++ b/slides/debugging-system-wide-profiling/bpf_lifecycle.dia @@ -0,0 +1,1100 @@ + + + + + + + + + + + + + #A4# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #myprog.bpf.c# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #myprog.bpf.o# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #verifier# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #kernel# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #userspace# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #clang# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #bpf()# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #attach# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #program runs +on event# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #userspace tool# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #map_1# + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + #myprog# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #map_2# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/slides/debugging-system-wide-profiling/debugging-system-wide-profiling.tex b/slides/debugging-system-wide-profiling/debugging-system-wide-profiling.tex index 43f0d9b58b..00cf6fa4de 100644 --- a/slides/debugging-system-wide-profiling/debugging-system-wide-profiling.tex +++ b/slides/debugging-system-wide-profiling/debugging-system-wide-profiling.tex @@ -177,7 +177,7 @@ \subsection{perf} \begin{itemize} \item {\em perf} allows to create dynamic tracepoints on both kernel functions and user-space functions. - \item In order to be able to insert probes, \kconfig{CONFIG_KPROBE} must be + \item In order to be able to insert probes, \kconfig{CONFIG_KPROBES} must be enabled in the kernel. \begin{itemize} \item Note: {\em libelf} is required to compile {\em perf} with @@ -644,8 +644,6 @@ \subsection{ftrace and trace-cmd} ... 
\end{minted} \end{block} - - \end{frame} \begin{frame}[fragile] @@ -795,6 +793,192 @@ \subsection{ftrace and trace-cmd} \center\includegraphics[height=0.8\textheight]{slides/debugging-system-wide-profiling/kernelshark.png} \end{frame} +\setuplabframe +{System wide profiling} +{ + Profiling a system from userspace to kernel space + \begin{itemize} + \item Profiling with ftrace, uprobes and kernelshark + \item Profiling with perf + \end{itemize} +} + +\subsection{LTTng} + +\begin{frame} + \frametitle{{\em LTTng}} + \begin{columns} + \column{0.65\textwidth} + \begin{itemize} + \item LTTng is an open source tracing framework for Linux maintained by + the \href{https://www.efficios.com/}{EfficiOS} company. + \item LTTng allows understanding the interactions between the kernel and + applications (C, C++, Java, Python). + \begin{itemize} + \item Also exposes a \code{/dev/lttng-logger} that can be used from any + application. + \end{itemize} + \item Tracepoints are associated with a payload (data). + \item LTTng is focused on low-overhead tracing. + \item Uses the Common Trace Format (so traces are readable with other + software like babeltrace or trace-compass) + \end{itemize} + \column{0.35\textwidth} + \includegraphics[height=0.3\textheight]{slides/debugging-system-wide-profiling/lttng-logo.jpg} + \end{columns} +\end{frame} + +\begin{frame} + \frametitle{Tracepoints with {\em LTTng} } + \begin{itemize} + \item LTTng works with a session daemon that receives all events from kernel + and userspace LTTng tracing components. + \item LTTng can use and trace the following instrumentation points: + \begin{itemize} + \item LTTng kernel tracepoints + \item kprobes and kretprobes + \item Linux kernel system calls + \item Linux user space probe + \item User space LTTng tracepoints + \end{itemize} + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{Creating userspace tracepoints with {\em LTTng}} + \begin{itemize} + \item New userspace tracepoints can be defined using LTTng. 
+ \item Tracepoints have multiple characteristics: + \begin{itemize} + \item A provider namespace + \item A name identifying the tracepoint + \item Parameters of various types (int, char *, etc) + \item Fields describing how to display the tracepoint parameters + (decimal, hexadecimal, etc) (see \href{https://lttng.org/man/3/lttng-ust/v2.13/}{LTTng-ust} manpage + for types) + \end{itemize} + \item Developers must perform multiple operations to use a UST tracepoint: + write a tracepoint provider (.h), write a tracepoint package (.c), build + the package, call the tracepoint in the traced application, and finally + build the application, linked with lttng-ust library and the package provider. + \item LTTng provides the \code{lttng-gen-tp} to ease all those steps, + allowing to only write a template (.tp) file. + \end{itemize} +\end{frame} + +\begin{frame}[fragile] + \frametitle{Defining a {\em LTTng} tracepoint (1/2)} + + \begin{itemize} + \item Tracepoint template (\code{hello_world-tp.tp}): + \begin{block}{} + \begin{minted}[fontsize=\tiny]{C} + LTTNG_UST_TRACEPOINT_EVENT( + // Tracepoint provider name + hello_world, + + // Tracepoint/event name + first_tp, + + // Tracepoint arguments (input) + LTTNG_UST_TP_ARGS( + char *, text + ), + + // Tracepoint/event fields (output) + LTTNG_UST_TP_FIELDS( + lttng_ust_field_string(message, text) + ) + ) + \end{minted} + \end{block} + \item \code{lttng-gen-tp} will take this template file and generate/build + all needed files (.h, .c and .o files) + \end{itemize} +\end{frame} + +\begin{frame}[fragile] + \frametitle{Defining a {\em LTTng} tracepoint (2/2)} + \begin{itemize} + \item Build tracepoint provider: + \end{itemize} + \begin{block}{} + \begin{minted}[fontsize=\tiny]{console} +$ lttng-gen-tp hello_world-tp.tp + \end{minted} + \end{block} + \begin{itemize} + \item Tracepoint usage (\code{hello_world.c}): + \end{itemize} + \begin{block}{} + \begin{minted}[fontsize=\tiny]{C} +#include +#include "hello_world-tp.h" + +int main(int 
argc, char *argv[]) +{ + lttng_ust_tracepoint(hello_world, first_tp, "hi there!"); + return 0; +} + \end{minted} + \end{block} + \begin{itemize} + \item Compilation: + \end{itemize} + \begin{block}{} + \begin{minted}[fontsize=\tiny]{console} +$ gcc hello_world.c hello_world-tp.o -llttng-ust -o hello_world + \end{minted} + \end{block} +\end{frame} + +\begin{frame}[fragile] + \frametitle{Using {\em LTTng}} + \begin{block}{} + \begin{minted}[fontsize=\small]{console} +$ lttng create my-tracing-session --output=./my_traces +$ lttng list --kernel +$ lttng list --userspace +$ lttng enable-event --userspace hello_world:first_tp +$ lttng enable-event --kernel --syscall open,close,write +$ lttng start +$ /* Run your application or do something */ +$ lttng destroy +$ babeltrace2 ./my_traces + \end{minted} + \end{block} + \begin{itemize} + \item You can also use + \href{https://eclipse.dev/tracecompass/trace-compass}{trace-compass} + to display the traces in a GUI + \end{itemize} +\end{frame} + +\begin{frame}[fragile] + \frametitle{Remote tracing with {\em LTTng}} + \begin{itemize} + \item LTTng allows to record traces over the network. + \item Useful for embedded systems with limited storage capabilities. + \item On the remote computer, run the \code{lttng-relayd} command + \end{itemize} + \begin{block}{} + \begin{minted}[fontsize=\small]{console} +$ lttng-relayd --output=${PWD}/traces + \end{minted} + \end{block} + \begin{itemize} + \item Then on the target, at session creation, use the \code{--set-url} option + \end{itemize} + \begin{block}{} + \begin{minted}[fontsize=\small]{console} +$ lttng create my-session --set-url=net://remote-system + \end{minted} + \end{block} + \begin{itemize} + \item Traces will then be recorded directly on the remote computer. 
+ \end{itemize} +\end{frame} + \subsection{eBPF} \begin{frame}{The ancestor: Berkeley Packet filter} @@ -845,55 +1029,233 @@ \subsection{eBPF} \begin{frame} \frametitle{eBPF (1/2)} \begin{itemize} - \item \href{https://ebpf.io/}{eBPF} framework in the kernel allows running - user-written BPF programs within the kernel in a safe and efficient - way (Added in kernel 3.15) - \item Execution is event-driven and can be hooked using Kprobes, tracepoints - and other methods of tracing - \item Executes complex actions and reports data to userspace for - events that took place in the kernel. - \item Used to hook into various places of the kernel: VFS, Network stack, - syscalls, load balancing, security, etc - \end{itemize} - \center\includegraphics[height=0.2\textheight]{slides/debugging-linux-application-stack/logo_ebpf.png}\\ + \item \href{https://ebpf.io/}{eBPF} is a new framework allowing to run + small user programs directly in the kernel, in a safe and efficient way. It + has been added in kernel 3.18 but it is still evolving and receiving + updates frequently. + \item eBPF programs can capture and expose kernel data to userspace, and + also alter kernel behavior based on some user-defined rules. + \item eBPF is event-driven: an eBPF program is triggered and executed on a + specific kernel event + \item A major benefit from eBPF is the possibility to reprogram the kernel + behavior, without performing kernel development: + \begin{itemize} + \item no risk of crashing the kernel because of bugs + \item faster development cycles to get a new feature ready + \end{itemize} + \end{itemize} + \center\includegraphics[height=0.2\textheight]{slides/debugging-linux-application-stack/logo_ebpf.png}\\ \tiny Image credits: \url{https://ebpf.io/} \end{frame} \begin{frame} \frametitle{eBPF (2/2)} \begin{itemize} - \item Programs are loaded using the \code{bpf()} system call - (\manpage{bpf}{2}) and then verified by the kernel BPF verifier before - being executed. 
+ \item The most notable eBPF features are: + \begin{itemize} + \item A new instruction set, interpreter and verifier + \item A wide variety of "attach" locations, allowing to hook programs + almost anywhere in the kernel + \item dedicated data structures called "maps", to exchange data between + multiple eBPF programs or between programs and userspace + \item A dedicated \code{bpf()} syscall to manipulate eBPF programs and data + \item plenty of (kernel) helper functions accessible from eBPF programs. + \end{itemize} + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{eBPF program lifecycle} + \begin{center} + \includegraphics[height=0.8\textheight]{slides/debugging-system-wide-profiling/bpf_lifecycle.pdf} + \end{center} +\end{frame} + +\begin{frame}[fragile] + \frametitle{Kernel configuration for eBPF} + \begin{itemize} + \item \kconfig{CONFIG_NET} to enable eBPF subsystem + \item \kconfig{CONFIG_BPF_SYSCALL} to enable the \code{bpf()} syscall + \item \kconfig{CONFIG_BPF_JIT} to enable JIT on programs and so increase performance + \item \kconfig{CONFIG_BPF_JIT_ALWAYS_ON} to force JIT + \item \kconfigval{CONFIG_BPF_UNPRIV_DEFAULT_OFF}{n} in \textbf{development} to + allow eBPF usage without root + \item You may then want to enable more general features to "unlock" + specific hooking locations: \begin{itemize} - \item Check of privileges to execute BPF program - \item Verifies that the BPF program always runs to completion and does not - loop forever + \item \kconfig{CONFIG_KPROBES} to allow hooking programs on kprobes + \item \kconfig{CONFIG_TRACING} to allow hooking programs on kernel tracepoints + \item \kconfig{CONFIG_NET_CLS_BPF} to write packets classifiers + \item \kconfig{CONFIG_CGROUP_BPF} to attach programs on cgroups hooks \end{itemize} - \item Almost all architectures have a BPF JIT support which allows - translating the BPF format into native CPU instruction, thus being - (almost) as fast as natively compiled code - \item BPF programs can 
return values in maps of various types (hash tables, - arrays, etc) which allows sharing data between user-space, eBPF - programs and kernel space. - \item Only some functions (called helpers) can be called in eBPF programs. - \item eBPF programs are attached to events (invoked on trigger). + \end{itemize} +\end{frame} + +\begin{frame}[fragile] + \frametitle{eBPF ISA} + \begin{itemize} + \item eBPF is a "virtual" ISA, defining its own set of instructions: load + and store instructions, arithmetic instructions, jump instructions, etc. + \item It also defines a set of 11 64-bit wide registers as well as a + calling convention: + \begin{itemize} + \item \code{R0}: return value from functions and BPF program + \item \code{R1, R2, R3, R4, R5}: function arguments + \item \code{R6, R7, R8, R9}: callee-saved registers + \item \code{R10}: read-only frame pointer (used to access the stack) + \end{itemize} + \end{itemize} + \begin{block}{} + \begin{minted}[fontsize=\scriptsize]{console} +; bpf_printk("Hello %s\n", "World"); + 0: r1 = 0x0 ll + 2: r2 = 0xa + 3: r3 = 0x0 ll + 5: call 0x6 +; return 0; + 6: r0 = 0x0 + 7: exit + \end{minted} + \end{block} +\end{frame} + +\begin{frame}[fragile] + \frametitle{The eBPF verifier} + \begin{itemize} + \item When loaded into the kernel, a program must first be validated by the + eBPF verifier. + \item The verifier is a complex piece of software which checks eBPF + programs against a set of rules to ensure that running those may not + compromise the whole kernel. For example: + \begin{itemize} + \item a program must always return and so not contain paths which could + make it "infinite" (e.g: no infinite loop) + \item a program must make sure that a pointer is valid before + dereferencing it + \item a program cannot access arbitrary memory addresses, it must use + passed context and available helpers + \end{itemize} + \item If a program violates one of the verifier rules, it will be rejected. 
+ \item Despite the presence of the verifier, you still need to be careful when + writing programs! eBPF programs run with preemption enabled (but CPU + migration disabled), so they can still suffer from concurrency issues + \begin{itemize} + \item Fortunately there are some mechanisms and helpers to avoid those issues, + like per-CPU map types. + \end{itemize} + \end{itemize} +\end{frame} + +\begin{frame}[fragile] + \frametitle{Program types and attach points} + \begin{itemize} + \item There are different "types" of places to which a program can be + hooked + \begin{itemize} + \item an arbitrary kprobe + \item a kernel-defined static tracepoint + \item a specific perf event + \item throughout the network stack + \item and a lot more, see \ksym{bpf_attach_type} + \end{itemize} + \item A specific attach-point type can only be hooked with a set of + specific program types, see \ksym{bpf_prog_type} and + \kdochtml{bpf/libbpf/program_types}. + \item The program type then defines the data passed to an eBPF program as + input when it is invoked. For example: + \begin{itemize} + \item A \code{BPF_PROG_TYPE_TRACEPOINT} program will receive a structure + containing all data returned to userspace by the targeted tracepoint. + \item A \code{BPF_PROG_TYPE_SCHED_CLS} program (used to implement packet + classifiers) will receive a \kstruct{__sk_buff}, the kernel + representation of a socket buffer. + \item You can learn about the context passed to any program type by + checking \kfile{include/linux/bpf_types.h} + \end{itemize} + \end{itemize} +\end{frame} + +\begin{frame}[fragile] + \frametitle{eBPF maps} + \begin{itemize} + \item eBPF programs exchange data with userspace or other programs through + maps of different nature: + \begin{itemize} + \item \code{BPF_MAP_TYPE_ARRAY}: generic array storage. Can be + differentiated per cpu + \item \code{BPF_MAP_TYPE_HASH}: a storage composed of key-value pairs. 
+ Keys can be of different types: \code{__u32}, a device type, an IP address... + \item \code{BPF_MAP_TYPE_QUEUE}: a FIFO-type queue + \item \code{BPF_MAP_TYPE_CGROUP_STORAGE}: a specific hash map keyed by a + cgroup id. There are other types of maps specific to other object types + (inodes, tasks, sockets, etc) + \item etc... + \end{itemize} + \item For basic data, it is easier and more efficient to directly use eBPF + global variables (no syscalls involved, contrary to maps) + \end{itemize} +\end{frame} + +\begin{frame}[fragile] + \frametitle{The \code{bpf()} syscall} + \begin{itemize} + \item The kernel exposes a \code{bpf()} syscall to allow interacting with the + eBPF subsystem + \item The syscall takes a set of subcommands, and depending on the + subcommand, some specific data: + \begin{itemize} + \item \ksym{BPF_PROG_LOAD} to load a bpf program + \item \ksym{BPF_MAP_CREATE} to allocate maps to be used by a program + \item \ksym{BPF_MAP_LOOKUP_ELEM} to search for an entry in a map + \item \ksym{BPF_MAP_UPDATE_ELEM} to update an entry in a map + \item etc + \end{itemize} + \item The syscall works with file descriptors pointing to eBPF resources. + Those resources (programs, maps, links, etc) remain valid while there is at least + one program holding a valid file descriptor to them. Those are automatically cleaned + up once there are no users left. + \item For more details, see \manpage{bpf}{2} \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Writing eBPF programs} \begin{itemize} - \item eBPF programs can be written in (restricted) C and are compiled - using clang compiler - \item BCC (BPF Compiler Collection) provides a toolkit to write BPF - programs more easily using C language (also provides LUA and Python - front-ends) + \item eBPF programs can either be written directly in raw eBPF assembly or in + higher level languages (e.g: C or Rust), and are compiled using the clang + compiler. 
+ \item The kernel provides some helpers that can be called from an eBPF program: \begin{itemize} - \item Allows to write tracing and profiling program easily + \item \code{bpf_trace_printk} Emits a log to the trace buffer + \item \code{bpf_map_{lookup,update,delete}_elem} Manipulates maps + \item \code{bpf_probe_{read,write}[_user]} Safely read/write data from/to kernel or userspace + \item \code{bpf_get_current_pid_tgid} Returns current Process ID and Thread group ID + \item \code{bpf_get_current_uid_gid} Returns current User ID and Group ID + \item \code{bpf_get_current_comm} Returns the name of the executable running in the + current task + \item \code{bpf_get_current_task} Returns the current \kstruct{task_struct} + \item Many other helpers are available, see \manpage{bpf-helpers}{7} + \end{itemize} + \item The kernel also exposes kfuncs (see \kdochtml{bpf/kfuncs}), but contrary + to bpf-helpers, those do not belong to the kernel stable interface. + \end{itemize} +\end{frame} + +\begin{frame}[fragile] + \frametitle{Manipulating eBPF programs} + \begin{itemize} + \item There are different ways to build, load and manipulate eBPF programs: + \begin{itemize} + \item One way is to write an eBPF program, build it with clang, and then load it, + attach it and read data from it with bare \code{bpf()} calls in a custom + userspace program + \item One can also use \code{bpftool} on the built eBPF program to + manipulate it (load, attach, read maps, etc), without writing any userspace tool + \item Or we can write our own eBPF tool thanks to some intermediate libraries which handle most of the + hard work, like libbpf + \item We can also use specialized frameworks like BCC or bpftrace to really + get all operations (bpf program build included) handled \end{itemize} - \item {\em bpftrace} is a high level language allowing to easily write tracing - functions \end{itemize} \end{frame} @@ -917,7 +1279,7 @@ \subsection{eBPF} \end{itemize} \column{0.25\textwidth} \vspace{0.5cm} - 
\includegraphics[height=0.2\textheight]{slides/debugging-linux-application-stack/logo_bcc.png}\\ + \includegraphics[height=0.2\textheight]{slides/debugging-linux-application-stack/logo_bcc.png}\\ \tiny Image credits: \url{https://github.com/iovisor/bcc} \end{columns} \end{frame} @@ -963,8 +1325,25 @@ \subsection{eBPF} \begin{frame}[fragile] \frametitle{Using BCC with python} \begin{itemize} - \item BCC python support allows to easily write and hook C program for BPF - tracing. + \item BCC exposes a \code{bcc} module, and especially a \code{BPF} class + \item eBPF programs are written in C and stored either in external files + or directly in a Python string. + \item When an instance of the \code{BPF} class is created and fed with the + program (either as string or file), it automatically builds, loads, and + possibly attaches the program + \item There are multiple ways to attach a program: + \begin{itemize} + \item By using a proper program name prefix, depending on the targeted + attach point (and so the attach step is performed automatically) + \item By explicitly calling the relevant attach method on the \code{BPF} + instance created earlier + \end{itemize} + \end{itemize} +\end{frame} + +\begin{frame}[fragile] + \frametitle{Using BCC with python} + \begin{itemize} \item Hook with a {\em kprobe} on the \code{clone()} system call and display \verb+"Hello, World!"+ each time it is called \end{itemize} @@ -987,317 +1366,404 @@ \subsection{eBPF} \end{frame} \begin{frame}[fragile] - \frametitle{bpftrace} - \begin{columns} - \column{0.75\textwidth} - \begin{itemize} - \item bpftrace is a high level tracing language allowing to write tracing - expressions easily (\url{https://bpftrace.org/}) - \item Also provide tools to trace various parts of the kernel - \begin{itemize} - \item Internally uses LLVM to compile script and BCC to interact with the BPF programs - \end{itemize} - \item bpftrace is inspired by awk and C, and predecessor tracers such as DTrace and SystemTap 
- \item Rich syntax documented at \url{https://github.com/iovisor/bpftrace/blob/master/docs/reference_guide.md} - \end{itemize} - \column{0.25\textwidth} - \vspace{0.5cm} - %% Source: https://commons.wikimedia.org/wiki/File:Elf-layout--en.svg - \includegraphics[height=0.2\textheight]{slides/debugging-system-wide-profiling/bpftrace.png}\\ - \tiny Image credits: \url{https://bpftrace.org/} - \end{columns} + \frametitle{libbpf} + \begin{itemize} + \item Instead of using a high level framework like BCC, one can use libbpf to + build custom tools with a finer control on every aspect of the program. + \item libbpf is a C-based library that aims to ease eBPF programming thanks + to the following features: + \begin{itemize} + \item userspace APIs to handle open/load/attach/teardown of bpf programs + \item userspace APIs to interact with attached programs + \item eBPF APIs to ease eBPF program writing + \end{itemize} + \item Packaged in many distributions and build systems (e.g.: Buildroot) + \item Learn more at \url{https://libbpf.readthedocs.io/en/latest/} + \end{itemize} \end{frame} \begin{frame}[fragile] - \frametitle{bpftrace tools} - - \begin{center} - \includegraphics[height=0.8\textheight]{slides/debugging-system-wide-profiling/bpftrace_tools_early2019.png}\\ - \tiny Image credits: \url{https://www.brendangregg.com/ebpf.html} - \end{center} + \frametitle{eBPF programming with libbpf (1/2)} + \begin{block}{\code{my_prog.bpf.c}} + \begin{minted}[fontsize=\tiny]{C} + #include + #include + #include + + #define TASK_COMM_LEN 16 + struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, __u64); + __uint(max_entries, 1); + } counter_map SEC(".maps"); + + struct sched_switch_args { + unsigned long long pad; + char prev_comm[TASK_COMM_LEN]; + int prev_pid; + int prev_prio; + long long prev_state; + char next_comm[TASK_COMM_LEN]; + int next_pid; + int next_prio; + }; + \end{minted} + \end{block} \end{frame} \begin{frame}[fragile] - \frametitle{Using 
bpftrace} - \begin{itemize} - \item Counting all syscalls per process: - \end{itemize} - \begin{block}{} - \begin{minted}[fontsize=\small]{console} -$ sudo bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @[comm] = count(); }' -Attaching 1 probe... -^C -@[packagekitd]: 1 -@[GUsbEventThread]: 1 -@[gvfs-afc-volume]: 1 -@[ibus-extension-]: 4 + \frametitle{eBPF programming with libbpf (2/2)} + \begin{block}{\code{my_prog.bpf.c}} + \begin{minted}[fontsize=\tiny]{C} + SEC("tracepoint/sched/sched_switch") + int sched_tracer(struct sched_switch_args *ctx) + { + __u32 key = 0; + __u64 *counter; + char *file; + + char fmt[] = "Old task was %s, new task is %s\n"; + bpf_trace_printk(fmt, sizeof(fmt), ctx->prev_comm, ctx->next_comm); + + counter = bpf_map_lookup_elem(&counter_map, &key); + if(counter) { + *counter += 1; + bpf_map_update_elem(&counter_map, &key, counter, 0); + } + + return 0; + } + + char LICENSE[] SEC("license") = "Dual BSD/GPL"; \end{minted} \end{block} \end{frame} -\begin{frame} - \frametitle{eBPF: resources} - \begin{itemize} - \item A Beginner’s Guide to eBPF Programming - Liz Rice, 2020 - \begin{itemize} - \item Slides: \url{https://speakerdeck.com/lizrice/beginners-guide-to-ebpf} - \item Video: \url{https://www.youtube.com/watch?v=lrSExTfS-iQ} - \item Resources: \url{https://github.com/lizrice/ebpf-beginners} - \end{itemize} - \end{itemize} - \begin{center} - \includegraphics[height=0.6\textheight]{slides/debugging-system-wide-profiling/ebpf_liz_rice_2020.png} - \end{center} -\end{frame} - -\subsection{LTTng} -\begin{frame} - \frametitle{{\em LTTng} (1/2)} - \begin{columns} - \column{0.65\textwidth} - \begin{itemize} - \item LTTng is an open source tracing framework for Linux maintained by - the \href{https://www.efficios.com/}{EfficiOS} company. - \item LTTng allows understanding the interactions between the kernel and - applications (C, C++, Java, Python). 
- \begin{itemize} - \item Also expose a \code{/dev/lttng-logger} that can be used from any - application. - \end{itemize} - \item Tracepoints are associated with a payload (data). - \item LTTng is focused on low-overhead tracing. - \item LTTng provides a unified logging of all events (kernel/user). - \end{itemize} - \column{0.35\textwidth} - \includegraphics[height=0.3\textheight]{slides/debugging-system-wide-profiling/lttng-logo.jpg} - \end{columns} -\end{frame} - -\begin{frame} - \frametitle{{\em LTTng} (2/2)} - \begin{itemize} - \item Uses the \href{https://diamon.org/ctf/}{CTF} trace format (Common - Trace Format). - \item LTTng is made of multiple components: +\begin{frame}[fragile] + \frametitle{Building eBPF programs} + \begin{itemize} + \item An eBPF program written in C can be built into a loadable object + thanks to clang: + \begin{block}{} + \begin{minted}{console} + $ clang -target bpf -O2 -g -c my_prog.bpf.c -o my_prog.bpf.o + \end{minted} + \end{block} \begin{itemize} - \item LTTng-tools: Libraries and command-line interface to control tracing. - \item LTTng-modules: Linux kernel modules to instrument and trace the kernel. - \item LTTng-UST: Libraries and Java/Python packages to instrument and trace user applications. + \item The \code{-g} option allows to add debug information as well as + BTF information \end{itemize} - \item Already packaged by various distribution (debian, fedora, etc) and - present in Buildroot and openembedded-core. - \item Uses a single tool \code{lttng} to control tracing. 
- \item No need to recompile the kernel but a few options are need + \item GCC can be used too with recent versions \begin{itemize} - \item \kconfig{CONFIG_MODULES}, \kconfig{CONFIG_KALLSYMS}, \kconfig{CONFIG_HIGH_RES_TIMERS}, - \kconfig{CONFIG_TRACEPOINTS}, \kconfig{CONFIG_KPROBES} + \item the toolchain can be installed with the \code{gcc-bpf} package in + Debian/Ubuntu + \item it exposes the \code{bpf-unknown-none} target \end{itemize} + \item To easily manipulate this program with a userspace program based on libbpf, + we need "skeleton" APIs, which can be generated with \code{bpftool} \end{itemize} \end{frame} \begin{frame}[fragile] - \frametitle{LTTng architecture} - \begin{center} - \includegraphics[height=0.8\textheight]{slides/debugging-system-wide-profiling/lttng_graph.png}\\ - \tiny Image credits: \url{https://lttng.org/} - \end{center} -\end{frame} - -\begin{frame} - \frametitle{Tracepoints with {\em LTTng} } + \frametitle{bpftool} \begin{itemize} - \item LTTng can use and trace the following instrumentation points: + \item \code{bpftool} is a command line tool allowing to interact with bpf + object files and the kernel to manipulate bpf programs: \begin{itemize} - \item LTTng kernel tracepoints - \item kprobes and kretprobes - \item Linux kernel system calls - \item Linux user space probe - \item User space LTTng tracepoints + \item Load programs into the kernel + \item List loaded programs + \item Dump program instructions, either as BPF code or JIT code + \item List loaded maps + \item Dump map content + \item Attach programs to hooks (so they can run) + \item etc \end{itemize} - \item LTTng works with a session daemon that receive all events from kernel - and userspace LTTng tracing components. - \item Session daemon should be started as daemon and the user should be in - the {\em tracing} group. 
+ \item You may need to mount the bpf filesystem to be able to pin program + (needed to keep a program loaded after bpftool has finished running): + \begin{block}{} + \begin{minted}{console} + $ mount -t bpf none /sys/fs/bpf + \end{minted} + \end{block} \end{itemize} \end{frame} -\begin{frame} - \frametitle{Creating userspace tracepoints with {\em LTTng}} +\begin{frame}[fragile] + \frametitle{bpftool} \begin{itemize} - \item New userspace tracepoints can be defined using LTTng. - \item Tracepoints have multiple characteristics: - \begin{itemize} - \item A provider namespace - \item A name identifying the tracepoint - \item Parameters of various types (int, char *, etc) - \item Fields describing how to display the tracepoint parameters - (decimal, hexadecimal, etc) - \end{itemize} - \item Tracepoints are defined using a tracepoint provider header file - template and a tracepoint provider package file. - \begin{itemize} - \item The tracepoint provider header file template contains the definition - of the tracepoints. - \item The tracepoint provider package is the instantiation of the - tracepoints. 
- \end{itemize} - \item See \href{https://lttng.org/man/3/lttng-ust/v2.13/}{LTTng-ust} manpage - for types + \item List loaded programs \end{itemize} -\end{frame} - -\begin{frame}[fragile] - \frametitle{Defining a {\em LTTng} tracepoint (1/2)} - + \begin{block}{} + \fontsize{10}{10}\selectfont + \begin{minted}{console} +$ bpftool prog +348: tracepoint name sched_tracer tag 3051de4551f07909 gpl +loaded_at 2024-08-06T15:43:11+0200 uid 0 +xlated 376B jited 215B memlock 4096B map_ids 146,148 +btf_id 545 + \end{minted} + \end{block} \begin{itemize} - \item Tracepoint provider header file (\code{hello_world-tp.h}): + \item Load and attach a program \end{itemize} \begin{block}{} - \begin{minted}[fontsize=\tiny]{C} -#undef LTTNG_UST_TRACEPOINT_PROVIDER -#define LTTNG_UST_TRACEPOINT_PROVIDER hello_world - -#undef LTTNG_UST_TRACEPOINT_INCLUDE -#define LTTNG_UST_TRACEPOINT_INCLUDE "./hello-tp.h" - -#if !defined(_HELLO_TP_H) || defined(LTTNG_UST_TRACEPOINT_HEADER_MULTI_READ) -#define _HELLO_TP_H - -#include - -LTTNG_UST_TRACEPOINT_EVENT( - hello_world, - my_first_tracepoint, - LTTNG_UST_TP_ARGS( - int, my_integer_arg, - char *, my_string_arg - ), - LTTNG_UST_TP_FIELDS( - lttng_ust_field_integer(int, my_integer_field, my_integer_arg) - lttng_ust_field_string(my_string_field, my_string_arg) - ) -) -#endif /* _HELLO_TP_H */ - -#include - \end{minted} + \fontsize{10}{10}\selectfont + \begin{minted}{console} +$ mkdir /sys/fs/bpf/myprog +$ bpftool prog loadall trace_execve.bpf.o /sys/fs/bpf/myprog autoattach + \end{minted} \end{block} -\end{frame} - -\begin{frame}[fragile] - \frametitle{Defining a {\em LTTng} tracepoint (2/2)} \begin{itemize} - \item Tracepoint provider package (\code{hello_world-tp.c}): + \item Unload a program \end{itemize} \begin{block}{} - \begin{minted}[fontsize=\tiny]{C} -#define LTTNG_UST_TRACEPOINT_CREATE_PROBES -#define LTTNG_UST_TRACEPOINT_DEFINE - -#include "hello-tp.h" - \end{minted} + \fontsize{10}{10}\selectfont + \begin{minted}{console} +$ rm -rf 
/sys/fs/bpf/myprog + \end{minted} \end{block} +\end{frame} +\begin{frame}[fragile] + \frametitle{bpftool} \begin{itemize} - \item Tracepoint usage (\code{hello_world.c}): + \item Dump a loaded program \end{itemize} \begin{block}{} - \begin{minted}[fontsize=\tiny]{C} -#include -#include "hello-tp.h" - -int main(int argc, char *argv[]) -{ - lttng_ust_tracepoint(hello_world, my_first_tracepoint, 23, "hi there!"); - return 0; -} - \end{minted} + \fontsize{8}{8}\selectfont + \begin{minted}{console} +$ bpftool prog dump xlated id 348 +int sched_tracer(struct sched_switch_args * ctx): +; int sched_tracer(struct sched_switch_args *ctx) + 0: (bf) r4 = r1 + 1: (b7) r1 = 0 +; __u32 key = 0; + 2: (63) *(u32 *)(r10 -4) = r1 +; char fmt[] = "Old task was %s, new task is %s\n"; + 3: (73) *(u8 *)(r10 -8) = r1 + 4: (18) r1 = 0xa7325207369206b + 6: (7b) *(u64 *)(r10 -16) = r1 + 7: (18) r1 = 0x7361742077656e20 +[...] + \end{minted} \end{block} \begin{itemize} - \item Compilation: + \item Dump eBPF program logs \end{itemize} \begin{block}{} - \begin{minted}[fontsize=\tiny]{console} -$ gcc hello_world.c hello_world-tp.c -llttng-ust -o hello_world - \end{minted} + \fontsize{6}{6}\selectfont + \begin{minted}{console} +$ bpftool prog tracelog +kworker/u80:0-11 [013] d..41 1796.003605: bpf_trace_printk: Old task was kworker/u80:0, new task is swapper/13 +-0 [013] d..41 1796.003609: bpf_trace_printk: Old task was swapper/13, new task is kworker/u80:0 +sudo-18640 [010] d..41 1796.003613: bpf_trace_printk: Old task was sudo, new task is swapper/10 +-0 [010] d..41 1796.003617: bpf_trace_printk: Old task was swapper/10, new task is sudo +[...] + \end{minted} \end{block} \end{frame} \begin{frame}[fragile] - \frametitle{Generating tracepoints using \code{lttng-gen-tp}} + \frametitle{bpftool} \begin{itemize} - \item Writing both the \code{.h} and \code{.c} boilerplate can be avoided - using \code{lttng-gen-tp}. 
- \item \code{lttng-gen-tp} takes a template file (\code{.tp}) as input and will - generate both the provider header and package files (\code{.h}, - \code{.c} and \code{.o} files): + \item List created maps \end{itemize} \begin{block}{} - \begin{minted}[fontsize=\tiny]{C} - LTTNG_UST_TRACEPOINT_EVENT( - // Tracepoint provider name - hello_world, - - // Tracepoint/event name - first_tp, - - // Tracepoint arguments (input) - LTTNG_UST_TP_ARGS( - char *, text - ), - - // Tracepoint/event fields (output) - LTTNG_UST_TP_FIELDS( - lttng_ust_field_string(message, text) - ) -) - \end{minted} + \fontsize{9}{9}\selectfont + \begin{minted}{console} +$ bpftool map +80: array name counter_map flags 0x0 + key 4B value 8B max_entries 1 memlock 256B + btf_id 421 +82: array name .rodata.str1.1 flags 0x80 + key 4B value 33B max_entries 1 memlock 288B + frozen +96: array name libbpf_global flags 0x0 + key 4B value 32B max_entries 1 memlock 280B +[...] + \end{minted} \end{block} -\end{frame} - -\begin{frame}[fragile] - \frametitle{Using {\em LTTng}} + \begin{itemize} + \item Show a map content + \end{itemize} \begin{block}{} - \begin{minted}[fontsize=\small]{console} -$ lttng create my-tracing-session --output=./my_traces -$ lttng list --kernel -$ lttng list --userspace -$ lttng enable-event --userspace hello_world:my_first_tracepoint -$ lttng enable-event --kernel --syscall open,close,write -$ lttng start -$ /* Run your application or do something */ -$ lttng destroy -$ babeltrace2 ./my_traces - \end{minted} + \fontsize{9}{9}\selectfont + \begin{minted}{console} +$ sudo bpftool map dump id 80 +[{ + "key": 0, + "value": 4877514 + } +] + \end{minted} \end{block} \end{frame} \begin{frame}[fragile] - \frametitle{Remote tracing with {\em LTTng}} + \frametitle{bpftool} \begin{itemize} - \item LTTng allows to record traces over the network. - \item Useful for embedded systems with limited storage capabilities. 
- \item On the remote computer, run \code{lttng-relayd} command + \item Generate libbpf APIs to manipulate a program \end{itemize} \begin{block}{} - \begin{minted}[fontsize=\small]{console} -$ lttng-relayd --output=${PWD}/traces - \end{minted} + \fontsize{9}{9}\selectfont + \begin{minted}{console} +$ bpftool gen skeleton trace_execve.bpf.o name trace_execve > trace_execve.skel.h + \end{minted} \end{block} \begin{itemize} - \item Then on the target, at session creation, use the \code{--set-url} + \item We can then write our userspace program and benefit from high-level + APIs to manipulate our eBPF program: + \begin{itemize} + \item instantiation of a global context object which will have references + to all of our programs, maps, links, etc + \item loading/attaching/unloading of our programs + \item eBPF program directly embedded in the generated header as a byte + array + \end{itemize} \end{itemize} +\end{frame} + +\begin{frame}[fragile] + \frametitle{Userspace code with libbpf} \begin{block}{} - \begin{minted}[fontsize=\small]{console} -$ lttng create my-session --set-url=net://remote-system - \end{minted} + \begin{minted}[fontsize=\tiny]{C} + #include <stdio.h> + #include <stdlib.h> + #include <unistd.h> + #include "trace_sched_switch.skel.h" + + int main(int argc, char *argv[]) + { + struct trace_sched_switch *skel; + int key = 0; + long counter = 0; + + skel = trace_sched_switch__open_and_load(); + if(!skel) + exit(EXIT_FAILURE); + if (trace_sched_switch__attach(skel)) { + trace_sched_switch__destroy(skel); + exit(EXIT_FAILURE); + } + + while(true) { + bpf_map__lookup_elem(skel->maps.counter_map, &key, sizeof(key), &counter, sizeof(counter), 0); + fprintf(stderr, "Scheduling switch count: %ld\n", counter); + sleep(1); + } + + return 0; + } + \end{minted} \end{block} +\end{frame} + +\begin{frame} + \frametitle{eBPF programs portability (1/2)} + \begin{itemize} + \item Kernel internals, contrary to userspace APIs, do not expose stable APIs.
+ This means that an eBPF program manipulating some kernel data may not work + with another kernel version + \item The CO-RE (Compile Once - Run Everywhere) approach aims to solve this issue + and make programs portable between \textbf{kernel versions}. It relies on + the following features: + \begin{itemize} + \item your kernel must be built with + \kconfigval{CONFIG_DEBUG_INFO_BTF}{y} to have BTF data embedded. BTF is a + format similar to DWARF which encodes data layout and function + signatures in an efficient way. + \item your eBPF compiler must be able to emit BTF relocations (both clang + and GCC are capable of this on recent versions, with the \code{-g} argument) + \item you need a BPF loader capable of processing BPF programs based on BTF data and + adjusting data accesses accordingly: \code{libbpf} is the de facto standard bpf + loader + \item you then need eBPF APIs to read/write to CO-RE relocatable + variables. libbpf provides such helpers, like \code{bpf_core_read} + \end{itemize} + \item To learn more, take a look at + \href{https://nakryiko.com/posts/bpf-core-reference-guide/}{Andrii + Nakryiko's CO-RE guide} + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{eBPF programs portability (2/2)} \begin{itemize} - \item Traces will then be recorded directly on the remote computer. + \item Despite CO-RE, you may still face different constraints on different + kernel versions, because of major feature introductions or changes, since + the eBPF subsystem keeps receiving frequent updates: + \begin{itemize} + \item eBPF tail calls (which allow a program to call a function) have + been added in version 4.2, and allow calling another program only since + version 5.10 + \item eBPF spin locks have been added in version 5.1 to prevent + concurrent accesses to maps shared between CPUs.
+ \item Different attach types keep being added, but possibly on different + kernel versions when it depends on the architecture: fentry/fexit attach + points have been added in kernel 5.5 for x86 but in 6.0 for arm32. + \item Any kind of loop (even bounded) was forbidden until version 5.3 + \item \code{CAP_BPF} capability, allowing a process to perform eBPF tasks, has + been added in version 5.8 + \end{itemize} + \end{itemize} +\end{frame} + +\begin{frame}[fragile] +\frametitle{eBPF for tracing/profiling} + \begin{itemize} + \item eBPF is a very powerful framework to spy on kernel internals: thanks + to the wide variety of attach points, you can expose almost any kernel code path and data. + \item At the same time, eBPF programs remain isolated from kernel code, + which makes it safe (compared to kernel development) and easy to use. + \item Thanks to the in-kernel interpreter and optimizations like JIT compilation, eBPF is very well + suited for tracing or profiling with low overhead, even in production + environments, while being very flexible. + \item This is why eBPF adoption level keeps growing for debugging, tracing + and profiling in the Linux ecosystem.
As a few examples, we find eBPF usage in: + \begin{itemize} + \item tracing frameworks like \href{https://github.com/iovisor/bcc}{BCC} + and \href{https://github.com/bpftrace/bpftrace}{bpftrace} + \item network infrastructure components, like + \href{https://github.com/cilium/cilium}{Cilium} or \href{https://github.com/projectcalico/calico}{Calico} + \item network packet tracers, like + \href{https://github.com/cilium/pwru}{pwru} or + \href{https://github.com/feiskyer/dropwatch}{dropwatch} + \item And many more, check \href{https://ebpf.io/applications/}{ebpf.io} + for more examples + \end{itemize} \end{itemize} \end{frame} +\begin{frame}[fragile] + \frametitle{eBPF: resources} + \begin{itemize} + \item BCC tutorial: + \url{https://github.com/iovisor/bcc/blob/master/docs/tutorial_bcc_python_developer.md} + \item libbpf-bootstrap: \url{https://github.com/libbpf/libbpf-bootstrap} + \item A Beginner’s Guide to eBPF Programming - Liz Rice, 2020 + \begin{itemize} + \item Video: \url{https://www.youtube.com/watch?v=lrSExTfS-iQ} + \item Resources: \url{https://github.com/lizrice/ebpf-beginners} + \end{itemize} + \end{itemize} + \begin{center} + \includegraphics[height=0.4\textheight]{slides/debugging-system-wide-profiling/ebpf_liz_rice_2020.png} + \end{center} +\end{frame} + +\setuplabframe +{System wide profiling} +{ + Creating custom tracing tools with eBPF + \begin{itemize} + \item Tracing with BCC + \item Converting a BCC script to libbpf + \item Bringing advanced features to the tool + \end{itemize} +} + \subsection{Choosing the right tool} \begin{frame}[fragile] @@ -1315,16 +1781,8 @@ \subsection{Choosing the right tool} \item For "constant" load problems, snapshot tools works fine. \item For sporadic problems, record traces and analyze them.
\end{itemize} + \item If you happen to have a complex setup that you often have to bring up, + it is likely a sign that you want to ease this setup with some custom tooling: + scripting, custom traces, eBPF, etc \end{itemize} \end{frame} - -\setuplabframe -{System wide profiling} -{ - Profiling a system from userspace to kernel space - \begin{itemize} - \item Profiling with ftrace, uprobes and kernelshark - \item Profiling with LTTng and trace-compass - \item Profiling with perf - \end{itemize} -} diff --git a/slides/debugging-system-wide-profiling/lttng_graph.png b/slides/debugging-system-wide-profiling/lttng_graph.png deleted file mode 100644 index d15947efa7..0000000000 Binary files a/slides/debugging-system-wide-profiling/lttng_graph.png and /dev/null differ