Skip to content

Commit

Permalink
Reduce generated functions: getindex (#28)
Browse files Browse the repository at this point in the history
* Convert getindex(::Blob, ::Val{field}) from generated func.

- The produced code is unchanged, and the perf remains the same.
- This is tested by a new testitem

* Eliminate another generated function: self_size!

Managed via compiler annotations

This new function is ~10x faster than the older `@generated` function:
- ~10ms down to ~1ms

* Reorganize to hopefully minimize the generated part of unsafe_store!

* Recursion for computing blob_offset

* Get unsafe_load() un-generated as well! :)

* Bump to v1.1

* Switch unsafe_store! to non-generated as well.

2x slower, but less compile time so worth it.

* Improve compile-time perf on self_size computation

* De-val getindex, which fixes getproperty codegen. (Unclear why)

```
julia> @code_llvm debuginfo=:none (b->b.x)(b)
define void @"julia_#35_644"([3 x i64]* noalias nocapture noundef nonnull sret([3 x i64]) align 8 dereferenceable(24) %0, [3 x i64]* nocapture noundef nonnull readonly align 8 dereferenceable(24) %1) #0 {
top:
  %memcpy_refined_src1 = getelementptr inbounds [3 x i64], [3 x i64]* %1, i64 0, i64 2
  %2 = load i64, i64* %memcpy_refined_src1, align 8
  %3 = bitcast [3 x i64]* %1 to <2 x i64>*
  %4 = load <2 x i64>, <2 x i64>* %3, align 8
  %5 = bitcast [3 x i64]* %0 to <2 x i64>*
  store <2 x i64> %4, <2 x i64>* %5, align 8
  %newstruct.sroa.3.0..sroa_idx4 = getelementptr inbounds [3 x i64], [3 x i64]* %0, i64 0, i64 2
  store i64 %2, i64* %newstruct.sroa.3.0..sroa_idx4, align 8
  ret void
}
```

* Force fieldidx to const-fold, even for large (100 field) structs

According to @benchmark, this made compiling getproperty and setproperty
for 100-field structs take 2ms longer (7ms -> 9ms)

* Add tests for allocations

* Noinline the function to actually throw the assertion

---------

Co-authored-by: Sacha Verweij <[email protected]>
  • Loading branch information
NHDaly and Sacha0 authored Jan 17, 2025
1 parent a938a10 commit 9773e2b
Show file tree
Hide file tree
Showing 3 changed files with 181 additions and 60 deletions.
5 changes: 3 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Blobs"
uuid = "163b9779-6631-5f90-a265-3de947924de8"
authors = []
version = "1.0.0"
version = "1.1.0"

[deps]
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
Expand All @@ -12,8 +12,9 @@ ReTestItems = "1"
julia = "1.3"

[extras]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
ReTestItems = "817f1d60-ba6b-4fd5-9520-3cf149f6a823"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["ReTestItems", "Test"]
test = ["BenchmarkTools", "ReTestItems", "Test"]
159 changes: 101 additions & 58 deletions src/blob.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,11 @@ function Blob{T}(blob::Blob) where T
end

function assert_same_allocation(blob1::Blob, blob2::Blob)
@assert getfield(blob1, :base) == getfield(blob2, :base) "These blobs do not share the same allocation: $blob1 - $blob2"
@noinline _throw(blob1, blob2) =
throw(AssertionError("These blobs do not share the same allocation: $blob1 - $blob2"))
if getfield(blob1, :base) != getfield(blob2, :base)
_throw(blob1, blob2)
end
end

function Base.pointer(blob::Blob{T}) where T
Expand Down Expand Up @@ -63,36 +67,61 @@ The number of bytes needed to allocate `T` itself.
Defaults to `sizeof(T)`.
"""
@generated function self_size(::Type{T}) where T
@assert isconcretetype(T)
Base.@assume_effects :foldable function self_size(::Type{T}) where T
# This function is marked :foldable to encourage constant folding for this types-only
# static computation.
if isempty(fieldnames(T))
quote
$(Expr(:meta, :inline))
$(sizeof(T))
end
sizeof(T)
else
quote
$(Expr(:meta, :inline))
$(+(0, @splice i in 1:length(fieldnames(T)) begin
self_size(fieldtype(T, i))
end))
# Recursion is the fastest way to compile this, confirmed with benchmarks.
# Alternatives considered:
# - +(Iterators.map(self_size, fieldtypes(T))...)
# - _iterative_sum_field_sizes for-loop (below).
# Splatting is always slower, and breaks after ~30 fields.
# The for-loop is faster after around 15-30 fields, so we pick an
# arbitrary cutoff of 20:
if fieldcount(T) > 20
_iterative_sum_field_sizes(T)
else
_recursive_sum_field_sizes(T)
end
end
end
# Fallback for wide structs (> 20 fields, per the cutoff in `self_size`): a plain
# runtime loop over the field types. Compiles faster than deep recursion there.
function _iterative_sum_field_sizes(::Type{T}) where T
out = 0
for f in fieldtypes(T)
out += Blobs.self_size(f)
end
out
end
# Recursive sum of `self_size` over fields 1..fieldcount(T), driven through `Val`
# so the compiler can unroll and constant-fold the whole computation.
# NOTE(review): `:foldable` is an assertion, not a request — it stays valid only
# while `self_size` remains effect-free for all field types.
Base.@assume_effects :foldable _recursive_sum_field_sizes(::Type{T}) where {T} =
_recursive_sum_field_sizes(T, Val(fieldcount(T)))
# Base case: no fields left to count contributes 0 bytes.
Base.@assume_effects :foldable _recursive_sum_field_sizes(::Type, ::Val{0}) = 0
Base.@assume_effects :foldable function _recursive_sum_field_sizes(::Type{T}, ::Val{i}) where {T,i}
# Add field `i`'s size, then recurse downward toward field 1.
return self_size(fieldtype(T, i)) + _recursive_sum_field_sizes(T, Val(i-1))
end

function blob_offset(::Type{T}, i::Int) where {T}
+(0, @splice j in 1:(i-1) begin
self_size(fieldtype(T, j))
end)
# Recursion scales better than splatting for large numbers of fields.
Base.@assume_effects :foldable @inline function blob_offset(::Type{T}, i::Int) where {T}
_recursive_sum_field_sizes(T, Val(i - 1))
end

@generated function Base.getindex(blob::Blob{T}, ::Type{Val{field}}) where {T, field}
i = findfirst(isequal(field), fieldnames(T))
@assert i != nothing "$T has no field $field"
quote
$(Expr(:meta, :inline))
Blob{$(fieldtype(T, i))}(blob + $(blob_offset(T, i)))
end
# Manually write a compile-time loop in the type domain, to enforce constant-folding the
# fieldidx even for large structs (with e.g. 100 fields). This might make compiling a touch
# slower, but it allows this to work for even large structs, like the manually-written
# `@generated` functions did before.
# Return the 1-based index of `field` within `T`'s fields. The search runs in
# the type domain (`Val`-indexed recursion) so it constant-folds even for very
# large structs — see the comment above about 100-field structs.
@inline function fieldidx(::Type{T}, ::Val{field}) where {T,field}
return _fieldidx_lookup(T, Val(field), Val(fieldcount(T)))
end
# Base case: scanned past field 1 without a match — `T` has no such field.
_fieldidx_lookup(::Type{T}, ::Val{field}, ::Val{0}) where {T,field} =
error("$T has no field $field")
# Compare field `i`'s name; on mismatch keep scanning downward toward field 1.
_fieldidx_lookup(::Type{T}, ::Val{field}, ::Val{i}) where {T,i,field} =
fieldname(T, i) === field ? i : _fieldidx_lookup(T, Val(field), Val(i-1))

# Return a `Blob` pointing at `field` inside the struct `T` referenced by
# `blob`. The field index and byte offset are both computed in the type domain,
# so this folds to pointer arithmetic at compile time.
@inline function Base.getindex(blob::Blob{T}, field::Symbol) where {T}
    idx = fieldidx(T, Val(field))
    return Blob{fieldtype(T, idx)}(blob + blob_offset(T, idx))
end

@inline function Base.getindex(blob::Blob{T}, i::Int) where {T}
Expand All @@ -112,47 +141,61 @@ Base.@propagate_inbounds function Base.setindex!(blob::Blob{T}, value) where T
setindex!(blob, convert(T, value))
end

@generated function Base.unsafe_load(blob::Blob{T}) where {T}
# Build a `type` instance directly from a Tuple of field values, without going
# through any user-visible constructor method.
macro _make_new(type, args)
# :splatnew lets you directly invoke the type's inner constructor with a Tuple,
# bypassing any effects from any custom constructors.
return Expr(:splatnew, esc(type), esc(args))
end
@inline function Base.unsafe_load(blob::Blob{T}) where {T}
if isempty(fieldnames(T))
quote
$(Expr(:meta, :inline))
unsafe_load(pointer(blob))
end
unsafe_load(pointer(blob))
else
quote
$(Expr(:meta, :inline))
$(Expr(:new, T, @splice (i, field) in enumerate(fieldnames(T)) quote
unsafe_load(getindex(blob, $(Val{field})))
end))
end
# This recursive definition is *almost* as fast as the `@generated` code. On julia
# 1.10, it has a single invoke function call here, which adds a few ns overhead.
# But on julia 1.11, this generates the expected code and is just as fast.
# We are sticking with this version though, to save the `@generated` compilation time.
@_make_new(T, _unsafe_load_fields(blob, Val(fieldcount(T))))
end
end
# Recursively load fields 1..I of `T` out of `blob`, returning them as a tuple
# in declaration order (the recursion bottoms out at field 0 with `()`).
@inline _unsafe_load_fields(::Blob, ::Val{0}) = ()
function _unsafe_load_fields(blob::Blob{T}, ::Val{I}) where {T, I}
    @inline
    earlier = _unsafe_load_fields(blob, Val(I - 1))
    fname = fieldname(T, I)
    return (earlier..., unsafe_load(getindex(blob, fname)))
end

@generated function Base.unsafe_store!(blob::Blob{T}, value::T) where {T}
@inline function Base.unsafe_store!(blob::Blob{T}, value::T) where {T}
if isempty(fieldnames(T))
quote
$(Expr(:meta, :inline))
unsafe_store!(pointer(blob), value)
value
end
elseif T <: Tuple
quote
$(Expr(:meta, :inline))
$(@splice (i, field) in enumerate(fieldnames(T)) quote
unsafe_store!(getindex(blob, $(Val{field})), value[$field])
end)
value
end
unsafe_store!(pointer(blob), value)
value
else
quote
$(Expr(:meta, :inline))
$(@splice (i, field) in enumerate(fieldnames(T)) quote
unsafe_store!(getindex(blob, $(Val{field})), value.$field)
end)
value
end
_unsafe_store_struct!(blob, value, Val(fieldcount(T)))
value
end
end
# On julia 1.11, this is equivalently fast to the `@generated` version.
# On julia 1.10, this is about 2x slower than generated for medium structs: ~10 ns vs ~5 ns.
# We will go with the recursive version, to avoid the compilation cost.
# Recursively store fields 1..I of `value` into `blob`, lowest field first.
# The recursion bottoms out at field 0 as a no-op.
@inline _unsafe_store_struct!(::Blob{T}, ::T, ::Val{0}) where {T} = nothing
function _unsafe_store_struct!(blob::Blob{T}, value::T, ::Val{I}) where {T, I}
    @inline
    _unsafe_store_struct!(blob, value, Val(I - 1))
    fname = fieldname(T, I)
    unsafe_store!(getindex(blob, fname), getproperty(value, fname))
    return nothing
end
# Recursive function for tuples is equivalent to unrolled via `@generated`.
# Stores each element of `value` into the corresponding slot of `blob`, then
# returns `value` (matching the struct `unsafe_store!` contract).
function Base.unsafe_store!(blob::Blob{T}, value::T) where {T <: Tuple}
    _unsafe_store_tuple!(blob, value, Val(fieldcount(T)))
    value
end
# Base case: zero elements left to store.
@inline _unsafe_store_tuple!(::Blob{T}, ::T, ::Val{0}) where {T<:Tuple} = nothing
function _unsafe_store_tuple!(blob::Blob{T}, value::T, ::Val{I}) where {T<:Tuple, I}
    @inline
    # BUG FIX: recurse via the tuple helper, not `_unsafe_store_struct!`.
    # The struct helper accesses fields via `getproperty(value, fieldnames(T)[i])`,
    # and tuple "field names" are Ints, for which `getproperty` has no method —
    # so storing a tuple with 2+ elements would throw a MethodError once the
    # recursion reached the struct path. Tuples must be indexed with `value[i]`.
    _unsafe_store_tuple!(blob, value, Val(I-1))
    unsafe_store!(getindex(blob, I), value[I])
    nothing
end

# if the value is the wrong type, try to convert it (just like setting a field normally)
function Base.unsafe_store!(blob::Blob{T}, value) where {T}
Expand All @@ -166,11 +209,11 @@ function Base.propertynames(::Blob{T}, private::Bool=false) where T
end

function Base.getproperty(blob::Blob{T}, field::Symbol) where T
getindex(blob, Val{field})
getindex(blob, field)
end

function Base.setproperty!(blob::Blob{T}, field::Symbol, value) where T
setindex!(blob, Val{field}, value)
setindex!(blob, Val(field), value)
end

function rewrite_address(expr)
Expand All @@ -185,7 +228,7 @@ function rewrite_address(expr)
else
error("Impossible?")
end
:(getindex($(rewrite_address(object)), $(Val{fieldname})))
:(getindex($(rewrite_address(object)), $(QuoteNode(fieldname))))
elseif expr.head == :ref
object = expr.args[1]
:(getindex($(rewrite_address(object)), $(map(esc, expr.args[2:end])...)))
Expand Down
77 changes: 77 additions & 0 deletions test/type-stability-tests.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# End-to-end type-stability and allocation tests for the de-`@generated` Blobs
# code paths: self_size, getproperty/getindex, unsafe_load, and unsafe_store!.
# Uses BenchmarkTools' @ballocated to assert the hot paths allocate nothing.
@testitem "type-stability" begin
using BenchmarkTools

# Fixture types: a nested pair of structs mixing inline scalar fields with
# Blob-typed (out-of-line) fields.
struct Quux
x::BlobVector{Int}
y::Float64
end
struct Bar
a::Int
b::BlobBitVector
c::Bool
d::BlobVector{Float64}
e::Blob{Quux}
end

# 8 + 16 + 1 + 16 + 8 = 49 bytes; cross-checked against the literal in the
# "self_size" testset below.
@test Blobs.self_size(Bar) == 8 + 16 + 1 + 16 + 8 # Blob{Quux} is smaller in the blob

# `child_size` methods tell Blobs how much out-of-line space to reserve for
# each type's variable-length children.
function Blobs.child_size(::Type{Quux}, x_len::Int64, y::Float64)
T = Quux
+(Blobs.child_size(fieldtype(T, :x), x_len))
end

function Blobs.child_size(::Type{Bar}, b_len::Int64, c::Bool, d_len::Int64, e_len::Int64, y::Float64)
T = Bar
+(Blobs.child_size(fieldtype(T, :b), b_len),
Blobs.child_size(fieldtype(T, :d), d_len),
Blobs.child_size(fieldtype(T, :e), e_len, y))
end

# `init` methods lay each child out in the free region and fill in scalars,
# threading the advancing `free` pointer through and returning it.
function Blobs.init(quux::Blob{Quux}, free::Blob{Nothing}, x_len::Int64, y::Float64)
free = Blobs.init(quux.x, free, x_len)
quux.y[] = y
free
end

function Blobs.init(bar::Blob{Bar}, free::Blob{Nothing}, b_len::Int64, c::Bool, d_len::Int64, e_len::Int64, y::Float64)
free = Blobs.init(bar.b, free, b_len)
free = Blobs.init(bar.d, free, d_len)
free = Blobs.init(bar.e, free, e_len, y)
bar.c[] = c
free
end

bar = Blobs.malloc_and_init(Bar, 10, false, 20, 15, 1.5)

# Test type stability
test_getproperty1(b) = b.e
test_getproperty2(b) = b.d
@testset "getindex" begin
@test @inferred(test_getproperty1(bar)) === bar.e
@test @ballocated(test_getproperty1($bar)) === 0
@test @inferred(test_getproperty2(bar)) === bar.d
@test @ballocated(test_getproperty2($bar)) === 0
end

@testset "unsafe_load" begin
@test @inferred(unsafe_load(bar)) isa Bar
@test @ballocated(unsafe_load($bar)) === 0
end

@testset "self_size" begin
# 49 must match the 8 + 16 + 1 + 16 + 8 computation asserted above.
@test @inferred(Blobs.self_size(Bar)) === 49
@test @ballocated(Blobs.self_size(Bar)) === 0
end

@testset "unsafe_store!" begin
bar_value = unsafe_load(bar)
@test @inferred(Blobs.unsafe_store!(bar, bar_value)) isa Bar
@test @ballocated(Blobs.unsafe_store!($bar, $bar_value)) === 0
end

# Round-trip through nested Blob references must also be inferred and
# allocation-free.
read_and_write(bar) = (bar.e[].y[] = bar.a[])
@testset "load & store" begin
@test @inferred(read_and_write(bar)) isa Int
@test @ballocated(read_and_write($bar)) === 0
end
end

2 comments on commit 9773e2b

@NHDaly
Copy link
Member Author

@NHDaly NHDaly commented on 9773e2b Jan 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/123214

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v1.1.0 -m "<description of version>" 9773e2b28be5abf91d4e6a39198c8fae325a3979
git push origin v1.1.0

Also, note the warning: Version 1.1.0 skips over 1.0.0
This can be safely ignored. However, if you want to fix this you can do so. Call register() again after making the fix. This will update the Pull request.

Please sign in to comment.