From 0b218706036046cf8933f9be0d8febcde1d81fc0 Mon Sep 17 00:00:00 2001
From: Cody Tapscott
Date: Mon, 30 Sep 2024 20:22:10 +0000
Subject: [PATCH] Add `LinuxPerf` extension for branch + instruction counts

This updates the core BenchmarkTools types to include `instructions` and
`branches` fields. If the `LinuxPerf` extension is not available, these
fields are `NaN`.

No support is included for measuring counter overhead or for making
judgements based on these fields, but Serialization, Statistics, etc. all
keep their usual functionality for `Trial`, `TrialEstimate`, and the other
wrapper types.
---
 .github/workflows/CI.yml         |   8 +-
 Project.toml                     |  11 +-
 ext/LinuxPerfExt/LinuxPerfExt.jl |  31 +++++
 src/BenchmarkTools.jl            |   2 +
 src/execution.jl                 |  65 ++++++++++--
 src/groups.jl                    |   2 +
 src/parameters.jl                |   6 +-
 src/serialization.jl             |  20 ++++
 src/trials.jl                    | 171 ++++++++++++++++++++++++++++---
 test/ExecutionTests.jl           |  10 +-
 test/GroupsTests.jl              |  32 ++++--
 test/SerializationTests.jl       |  37 ++++++-
 test/TrialsTests.jl              |  88 +++++++++++++---
 test/runtests.jl                 |   8 ++
 14 files changed, 427 insertions(+), 64 deletions(-)
 create mode 100644 ext/LinuxPerfExt/LinuxPerfExt.jl

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index b22812a9..1e25821c 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -16,7 +16,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6'
+          - '1.10'
          - '1'
          - 'nightly'
        arch:
@@ -24,12 +24,6 @@ jobs:
        os:
          - ubuntu-latest
        include:
-          - version: '1.7'
-            arch: x64
-            os: ubuntu-20.04
-          - version: '1.8'
-            arch: x64
-            os: ubuntu-22.04
          - version: '1.9'
            arch: x64
            os: ubuntu-22.04
diff --git a/Project.toml b/Project.toml
index 24139691..f80c690f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,6 +11,12 @@ Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 
+[weakdeps]
+LinuxPerf = "b4c46c6c-4fb0-484d-a11a-41bc3392d094"
+
+[extensions]
+LinuxPerfExt = "LinuxPerf"
+
 [compat]
 Aqua = "0.8"
 Compat = ">= 4.11.0"
@@ -22,7 +28,8 @@ Profile = "<0.0.1, 1"
 Statistics = "<0.0.1, 1"
 Test = "<0.0.1, 1"
 UUIDs = "<0.0.1, 1"
-julia = "1.6"
+julia = "1.9"
+LinuxPerf = ">= 0.4"
 
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
@@ -31,4 +38,4 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Aqua", "JuliaFormatter", "Statistics", "Test"]
+test = ["Aqua", "JuliaFormatter", "Statistics", "Test", "LinuxPerf"]
diff --git a/ext/LinuxPerfExt/LinuxPerfExt.jl b/ext/LinuxPerfExt/LinuxPerfExt.jl
new file mode 100644
index 00000000..e0a9dd77
--- /dev/null
+++ b/ext/LinuxPerfExt/LinuxPerfExt.jl
@@ -0,0 +1,31 @@
+module LinuxPerfExt
+
+import BenchmarkTools: PerfInterface
+import LinuxPerf: LinuxPerf, PerfBench, EventGroup, EventType
+import LinuxPerf: enable!, disable!, enable_all!, disable_all!, close, read!
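+# This extension wires LinuxPerf into the `PerfInterface` hooks that
+# BenchmarkTools invokes around its timed evaluation loop. It opens a
+# single event group counting hardware instructions and branches, so both
+# counters are scheduled together and observe the same span of execution.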
+
+function interface()
+    return PerfInterface(;
+        setup=() -> PerfBench(
+            0, [EventGroup([EventType(:hw, :instructions), EventType(:hw, :branches)])]
+        ),
+        start=(bench) -> enable_all!(),
+        stop=(bench) -> disable_all!(),
+        # start=(bench) -> enable!(bench),
+        # stop=(bench) -> disable!(bench),
+        teardown=(bench) -> close(bench),
+        read=(bench) -> let g = only(bench.groups)
+            # layout of a grouped read with enabled/running times:
+            # [nr, time_enabled, time_running, value_1, value_2]
+            vals = read!(g.leader_io, Vector{UInt64}(undef, 5))
+            (insts, branches) = (vals[4], vals[5])
+            return (insts, branches)
+        end,
+    )
+end
+
+end
diff --git a/src/BenchmarkTools.jl b/src/BenchmarkTools.jl
index 37102cbe..8ac42347 100644
--- a/src/BenchmarkTools.jl
+++ b/src/BenchmarkTools.jl
@@ -25,6 +25,8 @@ export loadparams!
 
 include("trials.jl")
 export gctime,
+    instructions,
+    branches,
     memory,
    allocs,
    params,
diff --git a/src/execution.jl b/src/execution.jl
index a9c3e25b..a80a0dd8 100644
--- a/src/execution.jl
+++ b/src/execution.jl
@@ -506,6 +506,24 @@ macro benchmarkable(args...)
     end
 end
 
+struct PerfInterface
+    setup::Function
+    start::Function
+    stop::Function
+    read::Function
+    teardown::Function
+
+    function PerfInterface(;
+        setup=Returns(nothing),
+        start=Returns(nothing),
+        stop=Returns(nothing),
+        read=Returns((-1, -1)),
+        teardown=Returns(nothing),
+    )
+        return new(setup, start, stop, read, teardown)
+    end
+end
+
 # `eval` an expression that forcibly defines the specified benchmark at
 # top-level in order to allow transfer of locally-scoped variables into
 # benchmark scope.
@@ -553,6 +571,8 @@ function generate_benchmark_definition(
             end
         )
     end
+    ext = Base.get_extension(BenchmarkTools, :LinuxPerfExt)  # name from [extensions] in Project.toml
+    perf = isnothing(ext) ? PerfInterface() : ext.interface()
     return Core.eval(
         eval_module,
         quote
@@ -563,17 +583,42 @@ function generate_benchmark_definition(
                 $(Expr(:tuple, quote_vars...)), __params::$BenchmarkTools.Parameters
             )
                 $(setup)
+                __perf_bench = $(perf.setup)()
+                __gcdiff = nothing
+                __return_val = nothing
+                __sample_time::Int64 = 0
+                __sample_instructions::Int64 = 0
+                __sample_branches::Int64 = 0
                 __evals = __params.evals
-                __gc_start = Base.gc_num()
-                __start_time = time_ns()
-                __return_val = $(invocation)
-                for __iter in 2:__evals
-                    $(invocation)
+                try
+                    __gc_start = Base.gc_num()
+                    $(perf.start)(__perf_bench)
+                    __start_time = time_ns()
+                    __return_val = $(invocation)
+                    for __iter in 2:__evals
+                        $(invocation)
+                    end
+                    __sample_time = time_ns() - __start_time
+                    $(perf.stop)(__perf_bench)
+                    __gcdiff = Base.GC_Diff(Base.gc_num(), __gc_start)
+                    __sample_instructions, __sample_branches = $(perf.read)(
+                        __perf_bench
+                    )
+                finally
+                    $(perf.teardown)(__perf_bench)
+                    $(teardown)
                 end
-                __sample_time = time_ns() - __start_time
-                __gcdiff = Base.GC_Diff(Base.gc_num(), __gc_start)
-                $(teardown)
                 __time = max((__sample_time / __evals) - __params.overhead, 0.001)
+                __instructions = if (__sample_instructions == -1)
+                    NaN
+                else
+                    max(__sample_instructions / __evals, 0.0)
+                end
+                __branches = if (__sample_branches == -1)
+                    NaN
+                else
+                    max(__sample_branches / __evals, 0.0)
+                end
                 __gctime = max((__gcdiff.total_time / __evals) - __params.overhead, 0.0)
                 __memory = Int(Base.fld(__gcdiff.allocd, __evals))
                 __allocs = Int(
@@ -585,7 +630,9 @@ function generate_benchmark_definition(
                         __evals,
                     ),
                 )
-                return __time, __gctime, __memory, __allocs, __return_val
+                return __time,
+                __instructions, __branches, __gctime, __memory, __allocs,
+                __return_val
             end
             $BenchmarkTools.Benchmark($(samplefunc), $(quote_vals), $(params))
         end,
diff --git a/src/groups.jl b/src/groups.jl
index c1022a80..7f1aafbd 100644
--- a/src/groups.jl
+++ b/src/groups.jl
@@ -113,6 +113,8 @@ Base.min(groups::BenchmarkGroup...) = mapvals(min, groups...)
 Base.max(groups::BenchmarkGroup...) = mapvals(max, groups...)
 
 Base.time(group::BenchmarkGroup) = mapvals(time, group)
+instructions(group::BenchmarkGroup) = mapvals(instructions, group)
+branches(group::BenchmarkGroup) = mapvals(branches, group)
 gctime(group::BenchmarkGroup) = mapvals(gctime, group)
 memory(group::BenchmarkGroup) = mapvals(memory, group)
 allocs(group::BenchmarkGroup) = mapvals(allocs, group)
diff --git a/src/parameters.jl b/src/parameters.jl
index ff1bc615..6591724b 100644
--- a/src/parameters.jl
+++ b/src/parameters.jl
@@ -109,7 +109,11 @@ end
 
 @noinline function overhead_sample(evals)
     start_time = time_ns()
-    for _ in 1:evals
+    try
+        for _ in 1:evals
+            nullfunc()
+        end
+    finally
         nullfunc()
     end
     sample_time = time_ns() - start_time
diff --git a/src/serialization.jl b/src/serialization.jl
index 7bec2c8d..577607af 100644
--- a/src/serialization.jl
+++ b/src/serialization.jl
@@ -55,6 +55,26 @@ function recover(x::Vector)
         else
             xsi = if fn == "evals_set" && !haskey(fields, fn)
                 false
+            elseif fn in ("instructions", "branches")
+                # JSON spec doesn't support NaN, so handle it specially here
+                if !haskey(fields, fn)
+                    if ft === Vector{Float64}
+                        Float64[NaN for _ in 1:length(fields["time"])]
+                    elseif ft === Float64
+                        NaN
+                    else
+                        @assert false
+                    end
+                else
+                    if ft === Vector{Float64}
+                        Float64[
+                            elem === nothing ? NaN : convert(Float64, elem) for
+                            elem in fields[fn]
+                        ]
+                    else
+                        fields[fn] === nothing ? NaN : convert(ft, fields[fn])
+                    end
+                end
             elseif fn in ("seconds", "overhead", "time_tolerance", "memory_tolerance") &&
                 fields[fn] === nothing
                 # JSON spec doesn't support Inf
diff --git a/src/trials.jl b/src/trials.jl
index 67382813..9786b1fb 100644
--- a/src/trials.jl
+++ b/src/trials.jl
@@ -5,27 +5,53 @@
 mutable struct Trial
     params::Parameters
     times::Vector{Float64}
+    instructions::Vector{Float64}
+    branches::Vector{Float64}
     gctimes::Vector{Float64}
     memory::Int
     allocs::Int
 end
 
-Trial(params::Parameters) = Trial(params, Float64[], Float64[], typemax(Int), typemax(Int))
+function Trial(params::Parameters)
+    return Trial(
+        params, Float64[], Float64[], Float64[], Float64[], typemax(Int), typemax(Int)
+    )
+end
+
+function eq_including_nan(x::Float64, y::Float64)
+    return x === y
+end
+
+function eq_including_nan(x::Vector{Float64}, y::Vector{Float64})
+    return length(x) == length(y) && all(map(eq_including_nan, x, y))
+end
 
 function Base.:(==)(a::Trial, b::Trial)
     return a.params == b.params &&
            a.times == b.times &&
+           eq_including_nan(a.instructions, b.instructions) &&
+           eq_including_nan(a.branches, b.branches) &&
            a.gctimes == b.gctimes &&
            a.memory == b.memory &&
            a.allocs == b.allocs
 end
 
 function Base.copy(t::Trial)
-    return Trial(copy(t.params), copy(t.times), copy(t.gctimes), t.memory, t.allocs)
+    return Trial(
+        copy(t.params),
+        copy(t.times),
+        copy(t.instructions),
+        copy(t.branches),
+        copy(t.gctimes),
+        t.memory,
+        t.allocs,
+    )
 end
 
-function Base.push!(t::Trial, time, gctime, memory, allocs)
+function Base.push!(t::Trial, time, instructions, branches, gctime, memory, allocs)
     push!(t.times, time)
+    push!(t.instructions, instructions)
+    push!(t.branches, branches)
     push!(t.gctimes, gctime)
     memory < t.memory && (t.memory = memory)
     allocs < t.allocs && (t.allocs = allocs)
@@ -34,20 +60,42 @@ end
 
 function Base.deleteat!(t::Trial, i)
     deleteat!(t.times, i)
+    deleteat!(t.instructions, i)
+    deleteat!(t.branches, i)
     deleteat!(t.gctimes, i)
     return t
 end
 
 Base.length(t::Trial) = length(t.times)
 function Base.getindex(t::Trial, i::Number)
-    return push!(Trial(t.params), t.times[i], t.gctimes[i], t.memory, t.allocs)
+    return push!(
+        Trial(t.params),
+        t.times[i],
+        t.instructions[i],
+        t.branches[i],
+        t.gctimes[i],
+        t.memory,
+        t.allocs,
+    )
+end
+function Base.getindex(t::Trial, i)
+    return Trial(
+        t.params,
+        t.times[i],
+        t.instructions[i],
+        t.branches[i],
+        t.gctimes[i],
+        t.memory,
+        t.allocs,
+    )
 end
-Base.getindex(t::Trial, i) = Trial(t.params, t.times[i], t.gctimes[i], t.memory, t.allocs)
 Base.lastindex(t::Trial) = length(t)
 
 function Base.sort!(t::Trial)
     inds = sortperm(t.times)
     t.times = t.times[inds]
+    t.instructions = t.instructions[inds]
+    t.branches = t.branches[inds]
     t.gctimes = t.gctimes[inds]
     return t
 end
@@ -55,6 +103,8 @@ end
 Base.sort(t::Trial) = sort!(copy(t))
 
 Base.time(t::Trial) = time(minimum(t))
+instructions(t::Trial) = instructions(minimum(t))
+branches(t::Trial) = branches(minimum(t))
 gctime(t::Trial) = gctime(minimum(t))
 memory(t::Trial) = t.memory
 allocs(t::Trial) = t.allocs
@@ -95,47 +145,91 @@ end
 
 mutable struct TrialEstimate
     params::Parameters
     time::Float64
+    instructions::Float64
+    branches::Float64
     gctime::Float64
     memory::Int
     allocs::Int
 end
 
-function TrialEstimate(trial::Trial, t, gct)
-    return TrialEstimate(params(trial), t, gct, memory(trial), allocs(trial))
+function TrialEstimate(trial::Trial, t, instructions, branches, gctime)
+    return TrialEstimate(
+        params(trial), t, instructions, branches, gctime, memory(trial), allocs(trial)
+    )
 end
 
 function Base.:(==)(a::TrialEstimate, b::TrialEstimate)
     return a.params == b.params &&
            a.time == b.time &&
+           eq_including_nan(a.instructions, b.instructions) &&
+           eq_including_nan(a.branches, b.branches) &&
            a.gctime == b.gctime &&
            a.memory == b.memory &&
            a.allocs == b.allocs
 end
 
 function Base.copy(t::TrialEstimate)
-    return TrialEstimate(copy(t.params), t.time, t.gctime, t.memory, t.allocs)
+    return TrialEstimate(
+        copy(t.params), t.time, t.instructions, t.branches, t.gctime, t.memory, t.allocs
+    )
 end
 
 function Base.minimum(trial::Trial)
     i = argmin(trial.times)
-    return TrialEstimate(trial, trial.times[i], trial.gctimes[i])
+    return TrialEstimate(
+        trial, trial.times[i], trial.instructions[i], trial.branches[i], trial.gctimes[i]
+    )
 end
 
 function Base.maximum(trial::Trial)
     i = argmax(trial.times)
-    return TrialEstimate(trial, trial.times[i], trial.gctimes[i])
+    return TrialEstimate(
+        trial, trial.times[i], trial.instructions[i], trial.branches[i], trial.gctimes[i]
+    )
 end
 
 function Statistics.median(trial::Trial)
-    return TrialEstimate(trial, median(trial.times), median(trial.gctimes))
+    return TrialEstimate(
+        trial,
+        median(trial.times),
+        any(!isnan, trial.instructions) ? median(filter(!isnan, trial.instructions)) : NaN,
+        any(!isnan, trial.branches) ? median(filter(!isnan, trial.branches)) : NaN,
+        median(trial.gctimes),
+    )
+end
+function Statistics.mean(trial::Trial)
+    return TrialEstimate(
+        trial,
+        mean(trial.times),
+        any(!isnan, trial.instructions) ? mean(filter(!isnan, trial.instructions)) : NaN,
+        any(!isnan, trial.branches) ? mean(filter(!isnan, trial.branches)) : NaN,
+        mean(trial.gctimes),
+    )
+end
+function Statistics.var(trial::Trial)
+    return TrialEstimate(
+        trial,
+        var(trial.times),
+        any(!isnan, trial.instructions) ? var(filter(!isnan, trial.instructions)) : NaN,
+        any(!isnan, trial.branches) ? var(filter(!isnan, trial.branches)) : NaN,
+        var(trial.gctimes),
+    )
+end
+function Statistics.std(trial::Trial)
+    return TrialEstimate(
+        trial,
+        std(trial.times),
+        any(!isnan, trial.instructions) ? std(filter(!isnan, trial.instructions)) : NaN,
+        any(!isnan, trial.branches) ? std(filter(!isnan, trial.branches)) : NaN,
+        std(trial.gctimes),
+    )
+end
 
-Statistics.mean(trial::Trial) = TrialEstimate(trial, mean(trial.times), mean(trial.gctimes))
-Statistics.var(trial::Trial) = TrialEstimate(trial, var(trial.times), var(trial.gctimes))
-Statistics.std(trial::Trial) = TrialEstimate(trial, std(trial.times), std(trial.gctimes))
 
 Base.isless(a::TrialEstimate, b::TrialEstimate) = isless(time(a), time(b))
 
 Base.time(t::TrialEstimate) = t.time
+instructions(t::TrialEstimate) = !isnan(t.instructions) ? t.instructions : nothing
+branches(t::TrialEstimate) = !isnan(t.branches) ? t.branches : nothing
 gctime(t::TrialEstimate) = t.gctime
 memory(t::TrialEstimate) = t.memory
 allocs(t::TrialEstimate) = t.allocs
@@ -148,6 +242,8 @@ params(t::TrialEstimate) = t.params
 
 mutable struct TrialRatio
     params::Parameters
     time::Float64
+    instructions::Float64
+    branches::Float64
     gctime::Float64
     memory::Float64
     allocs::Float64
@@ -156,14 +252,24 @@ end
 function Base.:(==)(a::TrialRatio, b::TrialRatio)
     return a.params == b.params &&
            a.time == b.time &&
+           eq_including_nan(a.instructions, b.instructions) &&
+           eq_including_nan(a.branches, b.branches) &&
            a.gctime == b.gctime &&
            a.memory == b.memory &&
            a.allocs == b.allocs
 end
 
-Base.copy(t::TrialRatio) = TrialRatio(copy(t.params), t.time, t.gctime, t.memory, t.allocs)
+function Base.copy(t::TrialRatio)
+    return TrialRatio(
+        copy(t.params), t.time, t.instructions, t.branches, t.gctime, t.memory, t.allocs
+    )
+end
 
 Base.time(t::TrialRatio) = t.time
+instructions(t::TrialRatio) = !isnan(t.instructions) ? t.instructions : nothing
+branches(t::TrialRatio) = !isnan(t.branches) ? t.branches : nothing
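+# NOTE: these accessors (and the TrialEstimate ones above) return `nothing`
+# rather than NaN when no perf data was collected for the trial.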
 gctime(t::TrialRatio) = t.gctime
 memory(t::TrialRatio) = t.memory
 allocs(t::TrialRatio) = t.allocs
@@ -183,6 +289,8 @@ function ratio(a::TrialEstimate, b::TrialEstimate)
     return TrialRatio(
         p,
         ratio(time(a), time(b)),
+        ratio(a.instructions, b.instructions),
+        ratio(a.branches, b.branches),
         ratio(gctime(a), gctime(b)),
         ratio(memory(a), memory(b)),
         ratio(allocs(a), allocs(b)),
@@ -289,6 +397,19 @@ function prettymemory(b)
     return string(@sprintf("%.2f", value), " ", units)
 end
 
+function prettycount(b; base_unit="")
+    if b < 1000
+        value, units = b, base_unit
+    elseif b < 1000^2
+        value, units = b / 1000, "K" * base_unit
+    elseif b < 1000^3
+        value, units = b / 1000^2, "M" * base_unit
+    else
+        value, units = b / 1000^3, "G" * base_unit
+    end
+    return string(@sprintf("%.2f", value), " ", units)
+end
+
 function withtypename(f, io, t)
     needtype = get(io, :typeinfo, Nothing) !== typeof(t)
     if needtype
@@ -382,6 +503,8 @@ function Base.show(io::IO, ::MIME"text/plain", t::Trial)
 
     perm = sortperm(t.times)
     times = t.times[perm]
+    instructions = t.instructions[perm]
+    branches = t.branches[perm]
     gctimes = t.gctimes[perm]
 
     if length(t) > 1
@@ -565,9 +688,19 @@ function Base.show(io::IO, ::MIME"text/plain", t::Trial)
 end
 
 function Base.show(io::IO, ::MIME"text/plain", t::TrialEstimate)
-    println(io, "BenchmarkTools.TrialEstimate: ")
+    println(io, "BenchmarkTools.TrialEstimate:")
     pad = get(io, :pad, "")
     println(io, pad, "  time:             ", prettytime(time(t)))
+    if instructions(t) !== nothing
+        println(
+            io, pad, "  instructions:     ", prettycount(instructions(t); base_unit="insts")
+        )
+    end
+    if branches(t) !== nothing
+        println(
+            io, pad, "  branches:         ", prettycount(branches(t); base_unit="branches")
+        )
+    end
     println(
         io,
         pad,
@@ -585,6 +718,12 @@ function Base.show(io::IO, ::MIME"text/plain", t::TrialRatio)
     println(io, "BenchmarkTools.TrialRatio: ")
     pad = get(io, :pad, "")
     println(io, pad, "  time: ", time(t))
+    if instructions(t) !== nothing
+        println(io, pad, "  instructions: ", instructions(t))
+    end
+    if branches(t) !== nothing
+        println(io, pad, "  branches: ", branches(t))
+    end
     println(io, pad, "  gctime: ", gctime(t))
     println(io, pad, "  memory: ", memory(t))
     return print(io, pad, "  allocs: ", allocs(t))
diff --git a/test/ExecutionTests.jl b/test/ExecutionTests.jl
index 57666815..b9cef1ca 100644
--- a/test/ExecutionTests.jl
+++ b/test/ExecutionTests.jl
@@ -261,7 +261,13 @@ for (tf, rex1, rex2) in (
     ioctx = IOContext(io, :logbins => tf)
     # A flat distribution won't trigger log by default
     b = BenchmarkTools.Trial(
-        BenchmarkTools.DEFAULT_PARAMETERS, 0.001 * (1:100) * 1e9, zeros(100), 0, 0
+        BenchmarkTools.DEFAULT_PARAMETERS,
+        0.001 * (1:100) * 1e9,
+        zeros(100),
+        zeros(100),
+        zeros(100),
+        0,
+        0,
     )
     show(ioctx, MIME("text/plain"), b)
     str = String(take!(io))
@@ -273,6 +279,8 @@ for (tf, rex1, rex2) in (
         BenchmarkTools.DEFAULT_PARAMETERS,
         t / sum(t) * 1e9 * BenchmarkTools.DEFAULT_PARAMETERS.seconds,
         zeros(100),
+        zeros(100),
+        zeros(100),
         0,
         0,
     )
diff --git a/test/GroupsTests.jl b/test/GroupsTests.jl
index 8a7a5dcd..2b6b9432 100644
--- a/test/GroupsTests.jl
+++ b/test/GroupsTests.jl
@@ -15,11 +15,13 @@ seteq(a, b) = length(a) == length(b) == length(intersect(a, b))
 
 g1 = BenchmarkGroup(["1", "2"])
 
-t1a = TrialEstimate(Parameters(; time_tolerance=0.05, memory_tolerance=0.05), 32, 1, 2, 3)
+t1a = TrialEstimate(
+    Parameters(; time_tolerance=0.05, memory_tolerance=0.05), 32, 0, 0, 1, 2, 3
+)
 t1b = TrialEstimate(
-    Parameters(; time_tolerance=0.40, memory_tolerance=0.40), 4123, 123, 43, 9
+    Parameters(; time_tolerance=0.40, memory_tolerance=0.40), 4123, 0, 0, 123, 43, 9
 )
-tc = TrialEstimate(Parameters(; time_tolerance=1.0, memory_tolerance=1.0), 1, 1, 1, 1)
+tc = TrialEstimate(Parameters(; time_tolerance=1.0, memory_tolerance=1.0), 1, 0, 0, 1, 1, 1)
 
 g1["a"] = t1a
 g1["b"] = t1b
@@ -30,16 +32,26 @@ g1similar = similar(g1)
 
 g2 = BenchmarkGroup(["2", "3"])
 
-t2a = TrialEstimate(Parameters(; time_tolerance=0.05, memory_tolerance=0.05), 323, 1, 2, 3)
+t2a = TrialEstimate(
+    Parameters(; time_tolerance=0.05, memory_tolerance=0.05), 323, 0, 0, 1, 2, 3
+)
 t2b = TrialEstimate(
-    Parameters(; time_tolerance=0.40, memory_tolerance=0.40), 1002, 123, 43, 9
+    Parameters(; time_tolerance=0.40, memory_tolerance=0.40), 1002, 0, 0, 123, 43, 9
 )
 
 g2["a"] = t2a
 g2["b"] = t2b
 g2["c"] = tc
 
-trial = BenchmarkTools.Trial(Parameters(), [1, 2, 5], [0, 1, 1], 3, 56)
+trial = BenchmarkTools.Trial(
+    Parameters(),
+    Float64[1, 2, 5],
+    Float64[0, 0, 0],
+    Float64[0, 0, 0],
+    Float64[0, 1, 1],
+    3,
+    56,
+)
 
 gtrial = BenchmarkGroup([], Dict("t" => trial))
@@ -151,10 +163,10 @@ groupsa["g1"] = g1
 groupsa["g2"] = g2
 g3a = addgroup!(groupsa, "g3", ["3", "4"])
 g3a["c"] = TrialEstimate(
-    Parameters(; time_tolerance=0.05, memory_tolerance=0.05), 6341, 23, 41, 536
+    Parameters(; time_tolerance=0.05, memory_tolerance=0.05), 6341, 0, 0, 23, 41, 536
 )
 g3a["d"] = TrialEstimate(
-    Parameters(; time_tolerance=0.13, memory_tolerance=0.13), 12341, 3013, 2, 150
+    Parameters(; time_tolerance=0.13, memory_tolerance=0.13), 12341, 0, 0, 3013, 2, 150
 )
 
 groups_copy = copy(groupsa)
@@ -165,10 +177,10 @@ groupsb["g1"] = g1
 groupsb["g2"] = g2
 g3b = addgroup!(groupsb, "g3", ["3", "4"])
 g3b["c"] = TrialEstimate(
-    Parameters(; time_tolerance=0.05, memory_tolerance=0.05), 1003, 23, 41, 536
+    Parameters(; time_tolerance=0.05, memory_tolerance=0.05), 1003, 0, 0, 23, 41, 536
 )
 g3b["d"] = TrialEstimate(
-    Parameters(; time_tolerance=0.23, memory_tolerance=0.23), 25341, 3013, 2, 150
+    Parameters(; time_tolerance=0.23, memory_tolerance=0.23), 25341, 0, 0, 3013, 2, 150
 )
 
 groupstrial = BenchmarkGroup()
diff --git a/test/SerializationTests.jl b/test/SerializationTests.jl
index e24314a1..f374b10e 100644
--- a/test/SerializationTests.jl
+++ b/test/SerializationTests.jl
@@ -6,7 +6,8 @@ using Test
 function eq(x::T, y::T) where {T<:Union{values(BenchmarkTools.SUPPORTED_TYPES)...}}
     return all(i -> eq(getfield(x, i), getfield(y, i)), 1:fieldcount(T))
 end
-eq(x::T, y::T) where {T} = isapprox(x, y)
+eq(x::Vector{Float64}, y::Vector{Float64}) = all(eq.(x, y))
+eq(x::T, y::T) where {T} = (x === y) || isapprox(x, y)
 
 function withtempdir(f::Function)
     d = mktempdir()
@@ -113,7 +114,7 @@ end
         [BenchmarkTools.Parameters(5.0, 10000, 1, true, 0.0, true, false, 0.05, 0.01)]
 end
 
-@testset "Inf in Paramters struct" begin
+@testset "Inf in Parameters struct" begin
     params = BenchmarkTools.Parameters(Inf, 10000, 1, false, Inf, true, false, Inf, Inf)
 
     io = IOBuffer()
@@ -124,4 +125,36 @@ end
     @test BenchmarkTools.load(json_io) == [params]
 end
 
+@testset "NaN in Trial" begin
+    trial1 = BenchmarkTools.Trial(
+        BenchmarkTools.Parameters(), [0.49, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], 2, 1
+    )
+    trial2 = BenchmarkTools.Trial(
+        BenchmarkTools.Parameters(), [0.49, 0.0], [NaN, NaN], [NaN, NaN], [0.0, 0.0], 2, 1
+    )
+
+    io = IOBuffer()
+    BenchmarkTools.save(io, trial1, trial2)
+    json_string = String(take!(io))
+    json_io = IOBuffer(json_string)
+
+    @test BenchmarkTools.load(json_io) == [trial1, trial2]
+end
+
+@testset "NaN in TrialEstimate" begin
+    trial_estimate1 = BenchmarkTools.TrialEstimate(
+        BenchmarkTools.Parameters(), 0.49, 0, 0, 0.0, 2, 1
+    )
+    trial_estimate2 = BenchmarkTools.TrialEstimate(
+        BenchmarkTools.Parameters(), 0.49, NaN, NaN, 0.0, 2, 1
+    )
+
+    io = IOBuffer()
+    BenchmarkTools.save(io, trial_estimate1, trial_estimate2)
+    json_string = String(take!(io))
+    json_io = IOBuffer(json_string)
+
+    @test BenchmarkTools.load(json_io) == [trial_estimate1, trial_estimate2]
+end
+
 end # module
diff --git a/test/TrialsTests.jl b/test/TrialsTests.jl
index c7ab8396..65f0dc3a 100644
--- a/test/TrialsTests.jl
+++ b/test/TrialsTests.jl
@@ -8,14 +8,14 @@ using Test
 #########
 
 trial1 = BenchmarkTools.Trial(BenchmarkTools.Parameters(; evals=2))
-push!(trial1, 2, 1, 4, 5)
-push!(trial1, 21, 0, 41, 51)
+push!(trial1, 2, 15, 2, 1, 4, 5)
+push!(trial1, 21, 17, 3, 0, 41, 51)
 
 trial2 = BenchmarkTools.Trial(BenchmarkTools.Parameters(; time_tolerance=0.15))
-push!(trial2, 21, 0, 41, 51)
-push!(trial2, 2, 1, 4, 5)
+push!(trial2, 21, 17, 3, 0, 41, 51)
+push!(trial2, 2, 15, 2, 1, 4, 5)
 
-push!(trial2, 21, 0, 41, 51)
+push!(trial2, 21, 17, 3, 0, 41, 51)
 @test length(trial2) == 3
 deleteat!(trial2, 3)
 @test length(trial1) == length(trial2) == 2
 sort!(trial2)
@@ -25,6 +25,8 @@ sort!(trial2)
 @test trial2.params ==
       BenchmarkTools.Parameters(; time_tolerance=trial2.params.time_tolerance)
 @test trial1.times == trial2.times == [2.0, 21.0]
+@test trial1.instructions == trial2.instructions == [15.0, 17.0]
+@test trial1.branches == trial2.branches == [2.0, 3.0]
 @test trial1.gctimes == trial2.gctimes == [1.0, 0.0]
 @test trial1.memory == trial2.memory == 4
 @test trial1.allocs == trial2.allocs == 5
@@ -34,10 +36,12 @@ trial2.params = trial1.params
 @test trial1 == trial2
 
 @test trial1[2] ==
-    push!(BenchmarkTools.Trial(BenchmarkTools.Parameters(; evals=2)), 21, 0, 4, 5)
+      push!(BenchmarkTools.Trial(BenchmarkTools.Parameters(; evals=2)), 21, 17, 3, 0, 4, 5)
 @test trial1[1:end] == trial1
 
 @test time(trial1) == time(trial2) == 2.0
+@test instructions(trial1) == instructions(trial2) == 15.0
+@test branches(trial1) == branches(trial2) == 2.0
 @test gctime(trial1) == gctime(trial2) == 1.0
 @test memory(trial1) == memory(trial2) == trial1.memory
 @test allocs(trial1) == allocs(trial2) == trial1.allocs
 
 # outlier trimming
 
 trial3 = BenchmarkTools.Trial(
-    BenchmarkTools.Parameters(), [1, 2, 3, 10, 11], [1, 1, 1, 1, 1], 1, 1
+    BenchmarkTools.Parameters(),
+    [1, 2, 3, 10, 11],
+    [0, 0, 0, 0, 0],
+    [0, 0, 0, 0, 0],
+    [1, 1, 1, 1, 1],
+    1,
+    1,
 )
 
 trimtrial3 = rmskew(trial3)
@@ -61,11 +71,11 @@ rmskew!(trial3)
 
 randtrial = BenchmarkTools.Trial(BenchmarkTools.Parameters())
 
 for _ in 1:40
-    push!(randtrial, rand(1:20), 1, 1, 1)
+    push!(randtrial, rand(1:20), 1, 1, 1, 1, 1)
 end
 
 while mean(randtrial) <= median(randtrial)
-    push!(randtrial, rand(10:20), 1, 1, 1)
+    push!(randtrial, rand(10:20), 1, 1, 1, 1, 1)
 end
 
 rmskew!(randtrial)
@@ -78,6 +88,8 @@ tstd = std(randtrial)
 tmax = maximum(randtrial)
 
 @test time(tmin) == time(randtrial)
+@test instructions(tmin) == instructions(randtrial)
+@test branches(tmin) == branches(randtrial)
 @test gctime(tmin) == gctime(randtrial)
 @test memory(tmin) ==
       memory(tmed) ==
@@ -117,14 +129,16 @@ x, y = rand(randrange), rand(randrange)
 
 @test ratio(0.0, 0.0) == 1.0
 
 ta = BenchmarkTools.TrialEstimate(
-    BenchmarkTools.Parameters(), rand(), rand(), rand(Int), rand(Int)
+    BenchmarkTools.Parameters(), rand(), rand(), rand(), rand(), rand(Int), rand(Int)
 )
 tb = BenchmarkTools.TrialEstimate(
-    BenchmarkTools.Parameters(), rand(), rand(), rand(Int), rand(Int)
+    BenchmarkTools.Parameters(), rand(), rand(), rand(), rand(), rand(Int), rand(Int)
 )
 
 tr = ratio(ta, tb)
 
 @test time(tr) == ratio(time(ta), time(tb))
+@test instructions(tr) == ratio(instructions(ta), instructions(tb))
+@test branches(tr) == ratio(branches(ta), branches(tb))
 @test gctime(tr) == ratio(gctime(ta), gctime(tb))
 @test memory(tr) == ratio(memory(ta), memory(tb))
 @test allocs(tr) == ratio(allocs(ta), allocs(tb))
@@ -138,10 +152,22 @@ tr = ratio(ta, tb)
 ##################
 
 ta = BenchmarkTools.TrialEstimate(
-    BenchmarkTools.Parameters(; time_tolerance=0.50, memory_tolerance=0.50), 0.49, 0.0, 2, 1
+    BenchmarkTools.Parameters(; time_tolerance=0.50, memory_tolerance=0.50),
+    0.49,
+    0,
+    0,
+    0.0,
+    2,
+    1,
 )
 tb = BenchmarkTools.TrialEstimate(
-    BenchmarkTools.Parameters(; time_tolerance=0.05, memory_tolerance=0.05), 1.00, 0.0, 1, 1
+    BenchmarkTools.Parameters(; time_tolerance=0.05, memory_tolerance=0.05),
+    1.00,
+    0,
+    0,
+    0.0,
+    1,
+    1,
 )
 tr = ratio(ta, tb)
 tj_ab = judge(ta, tb)
@@ -222,6 +248,13 @@ tj_r_2 = judge(tr; time_tolerance=2.0, memory_tolerance=2.0)
 @test BenchmarkTools.prettytime(999_999_999) == "1000.000 ms"
 @test BenchmarkTools.prettytime(1_000_000_000) == "1.000 s"
 
+@test BenchmarkTools.prettycount(999; base_unit="trials") == "999.00 trials"
+@test BenchmarkTools.prettycount(1000; base_unit="trials") == "1.00 Ktrials"
+@test BenchmarkTools.prettycount(999_999; base_unit="trials") == "1000.00 Ktrials"
+@test BenchmarkTools.prettycount(1_000_000; base_unit="trials") == "1.00 Mtrials"
+@test BenchmarkTools.prettycount(999_999_999; base_unit="trials") == "1000.00 Mtrials"
+@test BenchmarkTools.prettycount(1_000_000_000; base_unit="trials") == "1.00 Gtrials"
+
 @test BenchmarkTools.prettymemory(1023) == "1023 bytes"
 @test BenchmarkTools.prettymemory(1024) == "1.00 KiB"
 @test BenchmarkTools.prettymemory(1048575) == "1024.00 KiB"
@@ -230,7 +263,26 @@ tj_r_2 = judge(tr; time_tolerance=2.0, memory_tolerance=2.0)
 @test BenchmarkTools.prettymemory(1073741824) == "1.00 GiB"
 
 @test sprint(show, "text/plain", ta) ==
       sprint(show, ta; context=:compact => false) ==
       """
-BenchmarkTools.TrialEstimate: 
+BenchmarkTools.TrialEstimate:
+  time:             0.490 ns
+  instructions:     0.00 insts
+  branches:         0.00 branches
+  gctime:           0.000 ns (0.00%)
+  memory:           2 bytes
+  allocs:           1"""
+
+tc = BenchmarkTools.TrialEstimate(
+    BenchmarkTools.Parameters(; time_tolerance=0.50, memory_tolerance=0.50),
+    0.49,
+    NaN,
+    NaN,
+    0.0,
+    2,
+    1,
+)
+
+@test sprint(show, "text/plain", tc) == """
+BenchmarkTools.TrialEstimate:
   time:             0.490 ns
   gctime:           0.000 ns (0.00%)
   memory:           2 bytes
   allocs:           1"""
@@ -245,7 +297,9 @@ BenchmarkTools.TrialEstimate:
 
 @test sprint(show, [ta, tb]) == "BenchmarkTools.TrialEstimate[0.490 ns, 1.000 ns]"
 
-trial1sample = BenchmarkTools.Trial(BenchmarkTools.Parameters(), [1], [1], 1, 1)
+trial1sample = BenchmarkTools.Trial(
+    BenchmarkTools.Parameters(), [1.0], [1.0], [1.0], [1.0], 1, 1
+)
 @test try
     display(trial1sample)
     true
 catch e
     false
 end
@@ -266,7 +320,9 @@ else
             1.000 ns"""
 end
 
-trial = BenchmarkTools.Trial(BenchmarkTools.Parameters(), [1.0, 1.01], [0.0, 0.0], 0, 0)
+trial = BenchmarkTools.Trial(
+    BenchmarkTools.Parameters(), [1.0, 1.01], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], 0, 0
+)
 
 @test sprint(show, "text/plain", trial) == """
 BenchmarkTools.Trial: 2 samples with 1 evaluation.
  Range (min … max):  1.000 ns … 1.010 ns  ┊ GC (min … max): 0.00% … 0.00%
diff --git a/test/runtests.jl b/test/runtests.jl
index 6f58393a..98c20fb4 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -34,3 +34,11 @@ println("done (took ", took_seconds, " seconds)")
 print("Testing serialization...")
 took_seconds = @elapsed include("SerializationTests.jl")
 println("done (took ", took_seconds, " seconds)")
+
+@static if Sys.islinux()
+    using LinuxPerf
+
+    print("Testing execution (w/ LinuxPerf)...")
+    took_seconds = @elapsed include("ExecutionTests.jl")
+    println("done (took ", took_seconds, " seconds)")
+end
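
A minimal usage sketch of the new fields (not part of the patch; assumes a
Linux machine with the optional LinuxPerf.jl package installed, which
activates the `LinuxPerfExt` extension; without it, the `instructions` and
`branches` accessors return `nothing`):

    using BenchmarkTools
    using LinuxPerf  # loading this on Linux activates the extension

    x = rand(1000)
    t = @benchmark sum($x)

    est = minimum(t)
    time(est)          # minimum time per evaluation, as before
    instructions(est)  # hardware instructions per evaluation, or `nothing`
    branches(est)      # hardware branches per evaluation, or `nothing`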