Skip to content

Commit

Permalink
Reduce generated functions: getindex (#28)
Browse files Browse the repository at this point in the history
* Convert getindex(::Blob, ::Val{field}) from generated func.

- The produced code is unchanged, and the perf remains the same.
- This is tested by a new testitem

* Eliminate another generated function: self_size!

Managed via compiler annotations

This new function is ~10x faster than the older `@generated` function:
- ~10ms down to ~1ms

* Reorganize to hopefully minimize the generated part of unsafe_store!

* Recursion for computing blob_offset

* Get unsafe_load() un-generated as well! :)

* Bump to v1.1

* Switch unsafe_store! to non-generated as well.

2x slower, but less compile time so worth it.

* Improve compile-time perf on self_size computation

* De-val getindex, which fixes getproperty codegen. (Unclear why)

```
julia> @code_llvm debuginfo=:none (b->b.x)(b)
define void @"julia_#35_644"([3 x i64]* noalias nocapture noundef nonnull sret([3 x i64]) align 8 dereferenceable(24) %0, [3 x i64]* nocapture noundef nonnull readonly align 8 dereferenceable(24) %1) #0 {
top:
  %memcpy_refined_src1 = getelementptr inbounds [3 x i64], [3 x i64]* %1, i64 0, i64 2
  %2 = load i64, i64* %memcpy_refined_src1, align 8
  %3 = bitcast [3 x i64]* %1 to <2 x i64>*
  %4 = load <2 x i64>, <2 x i64>* %3, align 8
  %5 = bitcast [3 x i64]* %0 to <2 x i64>*
  store <2 x i64> %4, <2 x i64>* %5, align 8
  %newstruct.sroa.3.0..sroa_idx4 = getelementptr inbounds [3 x i64], [3 x i64]* %0, i64 0, i64 2
  store i64 %2, i64* %newstruct.sroa.3.0..sroa_idx4, align 8
  ret void
}
```

* Force fieldidx to const-fold, even for large (100 field) structs

According to @benchmark, this made compiling getproperty and setproperty
for 100-field structs take 2ms longer (7ms -> 9ms)

* Add tests for allocations

* Noinline the function to actually throw the assertion

---------

Co-authored-by: Sacha Verweij <[email protected]>
  • Loading branch information
NHDaly and Sacha0 authored Jan 17, 2025
1 parent a938a10 commit 9773e2b
Show file tree
Hide file tree
Showing 3 changed files with 181 additions and 60 deletions.
5 changes: 3 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Blobs"
uuid = "163b9779-6631-5f90-a265-3de947924de8"
authors = []
version = "1.0.0"
version = "1.1.0"

[deps]
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
Expand All @@ -12,8 +12,9 @@ ReTestItems = "1"
julia = "1.3"

[extras]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
ReTestItems = "817f1d60-ba6b-4fd5-9520-3cf149f6a823"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["ReTestItems", "Test"]
test = ["BenchmarkTools", "ReTestItems", "Test"]
159 changes: 101 additions & 58 deletions src/blob.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,11 @@ function Blob{T}(blob::Blob) where T
end

function assert_same_allocation(blob1::Blob, blob2::Blob)
@assert getfield(blob1, :base) == getfield(blob2, :base) "These blobs do not share the same allocation: $blob1 - $blob2"
@noinline _throw(blob1, blob2) =
throw(AssertionError("These blobs do not share the same allocation: $blob1 - $blob2"))
if getfield(blob1, :base) != getfield(blob2, :base)
_throw(blob1, blob2)
end
end

function Base.pointer(blob::Blob{T}) where T
Expand Down Expand Up @@ -63,36 +67,61 @@ The number of bytes needed to allocate `T` itself.
Defaults to `sizeof(T)`.
"""
@generated function self_size(::Type{T}) where T
@assert isconcretetype(T)
Base.@assume_effects :foldable function self_size(::Type{T}) where T
# This function is marked :foldable to encourage constant folding for this types-only
# static computation.
if isempty(fieldnames(T))
quote
$(Expr(:meta, :inline))
$(sizeof(T))
end
sizeof(T)
else
quote
$(Expr(:meta, :inline))
$(+(0, @splice i in 1:length(fieldnames(T)) begin
self_size(fieldtype(T, i))
end))
# Recursion is the fastest way to compile this, confirmed with benchmarks.
# Alternatives considered:
# - +(Iterators.map(self_size, fieldtypes(T))...)
# - _iterative_sum_field_sizes for-loop (below).
# Splatting is always slower, and breaks after ~30 fields.
# The for-loop is faster after around 15-30 fields, so we pick an
# arbitrary cutoff of 20:
if fieldcount(T) > 20
_iterative_sum_field_sizes(T)
else
_recursive_sum_field_sizes(T)
end
end
end
# Fallback for wide structs (> 20 fields, per the cutoff in `self_size`): a plain
# runtime loop over the field types. Compiles faster than deep recursion there.
function _iterative_sum_field_sizes(::Type{T}) where T
out = 0
for f in fieldtypes(T)
out += Blobs.self_size(f)
end
out
end
# Recursive sum of `self_size` over fields 1..fieldcount(T), driven through `Val`
# so the compiler can unroll and constant-fold the whole computation.
# NOTE(review): `:foldable` is an assertion, not a request — it stays valid only
# while `self_size` remains effect-free for all field types.
Base.@assume_effects :foldable _recursive_sum_field_sizes(::Type{T}) where {T} =
_recursive_sum_field_sizes(T, Val(fieldcount(T)))
# Base case: no fields left to count contributes 0 bytes.
Base.@assume_effects :foldable _recursive_sum_field_sizes(::Type, ::Val{0}) = 0
Base.@assume_effects :foldable function _recursive_sum_field_sizes(::Type{T}, ::Val{i}) where {T,i}
# Add field `i`'s size, then recurse downward toward field 1.
return self_size(fieldtype(T, i)) + _recursive_sum_field_sizes(T, Val(i-1))
end

function blob_offset(::Type{T}, i::Int) where {T}
+(0, @splice j in 1:(i-1) begin
self_size(fieldtype(T, j))
end)
# Recursion scales better than splatting for large numbers of fields.
Base.@assume_effects :foldable @inline function blob_offset(::Type{T}, i::Int) where {T}
_recursive_sum_field_sizes(T, Val(i - 1))
end

@generated function Base.getindex(blob::Blob{T}, ::Type{Val{field}}) where {T, field}
i = findfirst(isequal(field), fieldnames(T))
@assert i != nothing "$T has no field $field"
quote
$(Expr(:meta, :inline))
Blob{$(fieldtype(T, i))}(blob + $(blob_offset(T, i)))
end
# Manually write a compile-time loop in the type domain, to enforce constant-folding the
# fieldidx even for large structs (with e.g. 100 fields). This might make compiling a touch
# slower, but it allows this to work for even large structs, like the manually-written
# `@generated` functions did before.
# Return the 1-based index of `field` within `T`'s fields. The search runs in
# the type domain (`Val`-indexed recursion) so it constant-folds even for very
# large structs — see the comment above about 100-field structs.
@inline function fieldidx(::Type{T}, ::Val{field}) where {T,field}
return _fieldidx_lookup(T, Val(field), Val(fieldcount(T)))
end
# Base case: scanned past field 1 without a match — `T` has no such field.
_fieldidx_lookup(::Type{T}, ::Val{field}, ::Val{0}) where {T,field} =
error("$T has no field $field")
# Compare field `i`'s name; on mismatch keep scanning downward toward field 1.
_fieldidx_lookup(::Type{T}, ::Val{field}, ::Val{i}) where {T,i,field} =
fieldname(T, i) === field ? i : _fieldidx_lookup(T, Val(field), Val(i-1))

# Return a `Blob` pointing at `field` inside the struct `T` referenced by
# `blob`. The field index and byte offset are both computed in the type domain,
# so this folds to pointer arithmetic at compile time.
@inline function Base.getindex(blob::Blob{T}, field::Symbol) where {T}
    idx = fieldidx(T, Val(field))
    return Blob{fieldtype(T, idx)}(blob + blob_offset(T, idx))
end

@inline function Base.getindex(blob::Blob{T}, i::Int) where {T}
Expand All @@ -112,47 +141,61 @@ Base.@propagate_inbounds function Base.setindex!(blob::Blob{T}, value) where T
setindex!(blob, convert(T, value))
end

@generated function Base.unsafe_load(blob::Blob{T}) where {T}
# Build a `type` instance directly from a Tuple of field values, without going
# through any user-visible constructor method.
macro _make_new(type, args)
# :splatnew lets you directly invoke the type's inner constructor with a Tuple,
# bypassing any effects from any custom constructors.
return Expr(:splatnew, esc(type), esc(args))
end
@inline function Base.unsafe_load(blob::Blob{T}) where {T}
if isempty(fieldnames(T))
quote
$(Expr(:meta, :inline))
unsafe_load(pointer(blob))
end
unsafe_load(pointer(blob))
else
quote
$(Expr(:meta, :inline))
$(Expr(:new, T, @splice (i, field) in enumerate(fieldnames(T)) quote
unsafe_load(getindex(blob, $(Val{field})))
end))
end
# This recursive definition is *almost* as fast as the `@generated` code. On julia
# 1.10, it has a single invoke function call here, which adds a few ns overhead.
# But on julia 1.11, this generates the expected code and is just as fast.
# We are sticking with this version though, to save the `@generated` compilation time.
@_make_new(T, _unsafe_load_fields(blob, Val(fieldcount(T))))
end
end
# Recursively load fields 1..I of `T` out of `blob`, returning them as a tuple
# in declaration order (the recursion bottoms out at field 0 with `()`).
@inline _unsafe_load_fields(::Blob, ::Val{0}) = ()
function _unsafe_load_fields(blob::Blob{T}, ::Val{I}) where {T, I}
    @inline
    earlier = _unsafe_load_fields(blob, Val(I - 1))
    fname = fieldname(T, I)
    return (earlier..., unsafe_load(getindex(blob, fname)))
end

@generated function Base.unsafe_store!(blob::Blob{T}, value::T) where {T}
@inline function Base.unsafe_store!(blob::Blob{T}, value::T) where {T}
if isempty(fieldnames(T))
quote
$(Expr(:meta, :inline))
unsafe_store!(pointer(blob), value)
value
end
elseif T <: Tuple
quote
$(Expr(:meta, :inline))
$(@splice (i, field) in enumerate(fieldnames(T)) quote
unsafe_store!(getindex(blob, $(Val{field})), value[$field])
end)
value
end
unsafe_store!(pointer(blob), value)
value
else
quote
$(Expr(:meta, :inline))
$(@splice (i, field) in enumerate(fieldnames(T)) quote
unsafe_store!(getindex(blob, $(Val{field})), value.$field)
end)
value
end
_unsafe_store_struct!(blob, value, Val(fieldcount(T)))
value
end
end
# On julia 1.11, this is equivalently fast to the `@generated` version.
# On julia 1.10, this is about 2x slower than generated for medium structs: ~10 ns vs ~5 ns.
# We will go with the recursive version, to avoid the compilation cost.
# Recursively store fields 1..I of `value` into `blob`, lowest field first.
# The recursion bottoms out at field 0 as a no-op.
@inline _unsafe_store_struct!(::Blob{T}, ::T, ::Val{0}) where {T} = nothing
function _unsafe_store_struct!(blob::Blob{T}, value::T, ::Val{I}) where {T, I}
    @inline
    _unsafe_store_struct!(blob, value, Val(I - 1))
    fname = fieldname(T, I)
    unsafe_store!(getindex(blob, fname), getproperty(value, fname))
    return nothing
end
# Recursive function for tuples is equivalent to unrolled via `@generated`.
# Stores each element of `value` into the corresponding slot of `blob`, then
# returns `value` (matching the struct `unsafe_store!` contract).
function Base.unsafe_store!(blob::Blob{T}, value::T) where {T <: Tuple}
    _unsafe_store_tuple!(blob, value, Val(fieldcount(T)))
    value
end
# Base case: zero elements left to store.
@inline _unsafe_store_tuple!(::Blob{T}, ::T, ::Val{0}) where {T<:Tuple} = nothing
function _unsafe_store_tuple!(blob::Blob{T}, value::T, ::Val{I}) where {T<:Tuple, I}
    @inline
    # BUG FIX: recurse via the tuple helper, not `_unsafe_store_struct!`.
    # The struct helper accesses fields via `getproperty(value, fieldnames(T)[i])`,
    # and tuple "field names" are Ints, for which `getproperty` has no method —
    # so storing a tuple with 2+ elements would throw a MethodError once the
    # recursion reached the struct path. Tuples must be indexed with `value[i]`.
    _unsafe_store_tuple!(blob, value, Val(I-1))
    unsafe_store!(getindex(blob, I), value[I])
    nothing
end

# if the value is the wrong type, try to convert it (just like setting a field normally)
function Base.unsafe_store!(blob::Blob{T}, value) where {T}
Expand All @@ -166,11 +209,11 @@ function Base.propertynames(::Blob{T}, private::Bool=false) where T
end

function Base.getproperty(blob::Blob{T}, field::Symbol) where T
getindex(blob, Val{field})
getindex(blob, field)
end

function Base.setproperty!(blob::Blob{T}, field::Symbol, value) where T
setindex!(blob, Val{field}, value)
setindex!(blob, Val(field), value)
end

function rewrite_address(expr)
Expand All @@ -185,7 +228,7 @@ function rewrite_address(expr)
else
error("Impossible?")
end
:(getindex($(rewrite_address(object)), $(Val{fieldname})))
:(getindex($(rewrite_address(object)), $(QuoteNode(fieldname))))
elseif expr.head == :ref
object = expr.args[1]
:(getindex($(rewrite_address(object)), $(map(esc, expr.args[2:end])...)))
Expand Down
77 changes: 77 additions & 0 deletions test/type-stability-tests.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# End-to-end type-stability and allocation tests for the de-`@generated` Blobs
# code paths: self_size, getproperty/getindex, unsafe_load, and unsafe_store!.
# Uses BenchmarkTools' @ballocated to assert the hot paths allocate nothing.
@testitem "type-stability" begin
using BenchmarkTools

# Fixture types: a nested pair of structs mixing inline scalar fields with
# Blob-typed (out-of-line) fields.
struct Quux
x::BlobVector{Int}
y::Float64
end
struct Bar
a::Int
b::BlobBitVector
c::Bool
d::BlobVector{Float64}
e::Blob{Quux}
end

# 8 + 16 + 1 + 16 + 8 = 49 bytes; cross-checked against the literal in the
# "self_size" testset below.
@test Blobs.self_size(Bar) == 8 + 16 + 1 + 16 + 8 # Blob{Quux} is smaller in the blob

# `child_size` methods tell Blobs how much out-of-line space to reserve for
# each type's variable-length children.
function Blobs.child_size(::Type{Quux}, x_len::Int64, y::Float64)
T = Quux
+(Blobs.child_size(fieldtype(T, :x), x_len))
end

function Blobs.child_size(::Type{Bar}, b_len::Int64, c::Bool, d_len::Int64, e_len::Int64, y::Float64)
T = Bar
+(Blobs.child_size(fieldtype(T, :b), b_len),
Blobs.child_size(fieldtype(T, :d), d_len),
Blobs.child_size(fieldtype(T, :e), e_len, y))
end

# `init` methods lay each child out in the free region and fill in scalars,
# threading the advancing `free` pointer through and returning it.
function Blobs.init(quux::Blob{Quux}, free::Blob{Nothing}, x_len::Int64, y::Float64)
free = Blobs.init(quux.x, free, x_len)
quux.y[] = y
free
end

function Blobs.init(bar::Blob{Bar}, free::Blob{Nothing}, b_len::Int64, c::Bool, d_len::Int64, e_len::Int64, y::Float64)
free = Blobs.init(bar.b, free, b_len)
free = Blobs.init(bar.d, free, d_len)
free = Blobs.init(bar.e, free, e_len, y)
bar.c[] = c
free
end

bar = Blobs.malloc_and_init(Bar, 10, false, 20, 15, 1.5)

# Test type stability
test_getproperty1(b) = b.e
test_getproperty2(b) = b.d
@testset "getindex" begin
@test @inferred(test_getproperty1(bar)) === bar.e
@test @ballocated(test_getproperty1($bar)) === 0
@test @inferred(test_getproperty2(bar)) === bar.d
@test @ballocated(test_getproperty2($bar)) === 0
end

@testset "unsafe_load" begin
@test @inferred(unsafe_load(bar)) isa Bar
@test @ballocated(unsafe_load($bar)) === 0
end

@testset "self_size" begin
# 49 must match the 8 + 16 + 1 + 16 + 8 computation asserted above.
@test @inferred(Blobs.self_size(Bar)) === 49
@test @ballocated(Blobs.self_size(Bar)) === 0
end

@testset "unsafe_store!" begin
bar_value = unsafe_load(bar)
@test @inferred(Blobs.unsafe_store!(bar, bar_value)) isa Bar
@test @ballocated(Blobs.unsafe_store!($bar, $bar_value)) === 0
end

# Round-trip through nested Blob references must also be inferred and
# allocation-free.
read_and_write(bar) = (bar.e[].y[] = bar.a[])
@testset "load & store" begin
@test @inferred(read_and_write(bar)) isa Int
@test @ballocated(read_and_write($bar)) === 0
end
end

2 comments on commit 9773e2b

@NHDaly
Copy link
Member Author

@NHDaly NHDaly commented on 9773e2b Jan 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/123214

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v1.1.0 -m "<description of version>" 9773e2b28be5abf91d4e6a39198c8fae325a3979
git push origin v1.1.0

Also, note the warning: Version 1.1.0 skips over 1.0.0
This can be safely ignored. However, if you want to fix this you can do so. Call register() again after making the fix. This will update the Pull request.

Please sign in to comment.