diff --git a/src/QSM.jl b/src/QSM.jl index e3851c4..7695178 100644 --- a/src/QSM.jl +++ b/src/QSM.jl @@ -16,7 +16,7 @@ using SLEEFPirates: pow, sincos_fast using Static: known using StaticArrays: SVector using ThreadingUtilities: initialize_task -using TiledIteration: EdgeIterator, TileIterator, padded_tilesize +using TiledIteration: TileIterator, padded_tilesize using LinearAlgebra using FFTW @@ -52,11 +52,13 @@ end ##### ##### Polyester.jl ##### + function reset_threading() - # after user interrupt during @batch loop, threading has to be reset: + # if @batch loop gets interrupted, threading has to be reset: # https://github.com/JuliaSIMD/Polyester.jl/issues/30 + nt = min(nthreads(), (Sys.CPU_THREADS)::Int) - 1 reset_workers!() - foreach(initialize_task, 1:min(nthreads(), (Sys.CPU_THREADS)::Int) - 1) + foreach(initialize_task, 1:nt) return nothing end @@ -64,8 +66,8 @@ end ##### ##### FFTW.jl ##### -const FFTW_NTHREADS = Ref{Int}(known(num_cores())) +const FFTW_NTHREADS = Ref{Int}(known(num_cores())) @static if FFTW.fftw_provider == "fftw" # modified `FFTW.spawnloop` to use Polyester for multi-threading diff --git a/src/unwrap/laplacian.jl b/src/unwrap/laplacian.jl index 0a4d3a4..123056d 100644 --- a/src/unwrap/laplacian.jl +++ b/src/unwrap/laplacian.jl @@ -139,7 +139,7 @@ function wrapped_laplacian!( ) where {T<:AbstractFloat} checkshape(d2u, u, (:d2u, :u)) - nx, ny, nz, _ = size(u) + nx, ny, nz = size(u)[1:3] dx2, dy2, dz2 = convert.(T, inv.(dx.*dx)) tsz = padded_tilesize(T, (2, 2, 2), 1) @@ -173,7 +173,6 @@ end ##### Boundary treatment ##### -# unroll? probably too aggressive function wrapped_laplacian_boundary_neumann!( d2u::AbstractArray{<:AbstractFloat, N}, u::AbstractArray{T, N}, @@ -182,26 +181,17 @@ function wrapped_laplacian_boundary_neumann!( N ∈ (3, 4) || throw(ArgumentError("arrays must be 3d or 4d, got $(N)d")) checkshape(d2u, u, (:d2u, :u)) - sz = size(u) - nx, ny, nz = sz[1:3] + zeroT = zero(T) + nx, ny, nz = size(u)[1:3] dx2, dy2, dz2 = convert.(T, inv.(dx.*dx)) - _zero = zero(T) - - outer = CartesianIndices(ntuple(n -> 1:sz[n], Val(3))) - inner = CartesianIndices(ntuple(n -> 2:sz[n]-1, Val(3))) - E = EdgeIterator(outer, inner) - - R = Vector{NTuple{3, Int}}(undef, length(E)) - @inbounds for (i, I) in enumerate(E) - R[i] = Tuple(I) - end + R = edge_indices(axes(u)[1:3]) @inbounds for t in axes(u, 4) _u = @view(u[:,:,:,t]) _d2u = @view(d2u[:,:,:,t]) @batch per=thread for (i, j, k) in R - Δ = _zero + Δ = zeroT if k > 1 Δ = muladd(-dz2, _wrap(_u[i,j,k] - _u[i,j,k-1]), Δ) @@ -243,40 +233,32 @@ function wrapped_laplacian_boundary_periodic!( N ∈ (3, 4) || throw(ArgumentError("arrays must be 3d or 4d, got $(N)d")) checkshape(d2u, u, (:d2u, :u)) - sz = size(u) - nx, ny, nz = sz[1:3] + nx, ny, nz = size(u)[1:3] dx2, dy2, dz2 = convert.(T, inv.(dx.*dx)) - outer = CartesianIndices(ntuple(n -> 1:sz[n], Val(3))) - inner = CartesianIndices(ntuple(n -> 2:sz[n]-1, Val(3))) - E = EdgeIterator(outer, inner) - - R = Vector{NTuple{3, Int}}(undef, length(E)) - @inbounds for (i, I) in enumerate(E) - R[i] = Tuple(I) - end + R = edge_indices(axes(u)[1:3]) @inbounds for t in axes(u, 4) _u = @view(u[:,:,:,t]) _d2u = @view(d2u[:,:,:,t]) @batch per=thread for (i, j, k) in R - du = k == 1 ? _u[i,j,k] - _u[i,j,end] : _u[i,j,k] - _u[i,j,k-1] - Δ = -dz2 * _wrap(du) + v = k == 1 ? _u[i,j,end] : _u[i,j,k-1] + Δ = -dz2 * _wrap(_u[i,j,k] - v) - du = j == 1 ? _u[i,j,k] - _u[i,end,k] : _u[i,j,k] - _u[i,j-1,k] - Δ = muladd(-dy2, _wrap(du), Δ) + v = j == 1 ? _u[i,end,k] : _u[i,j-1,k] + Δ = muladd(-dy2, _wrap(_u[i,j,k] - v), Δ) - du = i == 1 ? _u[i,j,k] - _u[end,j,k] : _u[i,j,k] - _u[i-1,j,k] - Δ = muladd(-dx2, _wrap(du), Δ) + v = i == 1 ? _u[end,j,k] : _u[i-1,j,k] + Δ = muladd(-dx2, _wrap(_u[i,j,k] - v), Δ) - du = i == nx ? _u[1,j,k] - _u[i,j,k] : _u[i+1,j,k] - _u[i,j,k] - Δ = muladd(dx2, _wrap(du), Δ) + v = i == nx ? _u[1,j,k] : _u[i+1,j,k] + Δ = muladd(dx2, _wrap(v - _u[i,j,k]), Δ) - du = j == ny ? _u[i,1,k] - _u[i,j,k] : _u[i,j+1,k] - _u[i,j,k] - Δ = muladd(dy2, _wrap(du), Δ) + v = j == ny ? _u[i,1,k] : _u[i,j+1,k] + Δ = muladd(dy2, _wrap(v - _u[i,j,k]), Δ) - du = k == nz ? _u[i,j,1] - _u[i,j,k] : _u[i,j,k+1] - _u[i,j,k] - Δ = muladd(dz2, _wrap(du), Δ) + v = k == nz ? _u[i,j,1] : _u[i,j,k+1] + Δ = muladd(dz2, _wrap(v - _u[i,j,k]), Δ) _d2u[i,j,k] = Δ end diff --git a/src/utils/poisson_solver/multigrid/poisson.jl b/src/utils/poisson_solver/multigrid/poisson.jl index fb4a560..35113c6 100644 --- a/src/utils/poisson_solver/multigrid/poisson.jl +++ b/src/utils/poisson_solver/multigrid/poisson.jl @@ -261,13 +261,10 @@ function boundary_mask!( b = tzero(mb) - # array boundary - outer = CartesianIndices(ntuple(n -> 2:sz[n]-1, Val(3))) - inner = CartesianIndices(ntuple(n -> 3:sz[n]-2-Int(iseven(sz[n])), Val(3))) - if !isempty(inner) - @inbounds for I in EdgeIterator(outer, inner) - b[I] = m[I] - end + outer = ntuple(n -> 2:sz[n]-1, Val(3)) + inner = ntuple(n -> 3:sz[n]-2-Int(iseven(sz[n])), Val(3)) + _edgeloop(outer, inner) do i, j, k + @inbounds b[i,j,k] = m[i,j,k] end # interior diff --git a/src/utils/r2star.jl b/src/utils/r2star.jl index 6d78b79..a524bc6 100644 --- a/src/utils/r2star.jl +++ b/src/utils/r2star.jl @@ -23,8 +23,8 @@ function r2star_ll( N > 1 || throw(ArgumentError("array must contain echoes in last dimension")) NT > 1 || throw(ArgumentError("data must be multi-echo")) - size(mag, N) == NT || throw(DimensionMismatch()) - mask === nothing || length(mask) == length(mag) ÷ NT || throw(DimensionMismatch()) + checkshape((NT,), (size(mag, N),), (:TEs, :mag)) + mask !== nothing && checkshape(axes(mask), axes(mag)[1:N-1], (:mask, :mag)) r2s = similar(mag, size(mag)[1:N-1]) r2s = tfill!(r2s, zero(T)) @@ -37,34 +37,32 @@ function r2star_ll( if mask === nothing b = tmap(log, transpose(vmag)) x̂ = ldiv!(A, b) - @inbounds @batch for I in eachindex(vr2s) - vr2s[I] = x̂[1,I] + @batch for I in eachindex(vr2s) + @inbounds vr2s[I] = x̂[1,I] end else - vmask = vec(mask) - i = 0 - R = Vector{Int}(undef, sum(vmask)) - @inbounds for I in eachindex(vmask) - if vmask[I] + R = Vector{Int}(undef, sum(mask)) + @inbounds for I in eachindex(mask) + if mask[I] R[i += 1] = I end end b = similar(mag, (NT, length(R))) - @inbounds for t in Base.OneTo(NT) + for t in 1:NT bt = @view(b[t,:]) mt = @view(vmag[:,t]) @batch for I in eachindex(R) - bt[I] = log(mt[R[I]]) + @inbounds bt[I] = log(mt[R[I]]) end end x̂ = ldiv!(A, b) - @inbounds @batch for I in eachindex(R) - vr2s[R[I]] = x̂[1,I] + @batch for I in eachindex(R) + @inbounds vr2s[R[I]] = x̂[1,I] end end @@ -103,8 +101,8 @@ function r2star_arlo( N > 1 || throw(ArgumentError("array must contain echoes in last dimension")) NT > 2 || throw(ArgumentError("ARLO requires at least 3 echoes")) - size(mag, N) == NT || throw(DimensionMismatch()) - mask === nothing || length(mask) == length(mag) ÷ NT || throw(DimensionMismatch()) + checkshape((NT,), (size(mag, N),), (:TEs, :mag)) + mask !== nothing && checkshape(axes(mask), axes(mag)[1:N-1], (:mask, :mag)) all((≈)(TEs[2]-TEs[1]), TEs[2:end].-TEs[1:end-1]) || throw(DomainError("ARLO requires equidistant echoes")) @@ -115,21 +113,21 @@ function r2star_arlo( vmag = reshape(mag, :, NT) vr2s = vec(r2s) - _zeroT = zero(T) - _twoT = convert(T, 2) - _fourT = convert(T, 4) + zeroT = zero(T) + twoT = convert(T, 2) + fourT = convert(T, 4) α = convert(T, 3 / (TEs[2]-TEs[1])) - if mask === nothing - @inbounds @batch for I in eachindex(vr2s) + @batch for I in eachindex(vr2s) + @inbounds if mask === nothing || mask[I] m0 = vmag[I,1] m1 = vmag[I,2] m2 = vmag[I,3] δ = m0 - m2 - s = m0 + muladd(_fourT, m1, m2) - a = muladd(_twoT, m1, m0) + s = m0 + muladd(fourT, m1, m2) + a = muladd(twoT, m1, m0) num = δ * a den = s * a @@ -140,47 +138,14 @@ function r2star_arlo( m2 = vmag[I,t+2] δ = m0 - m2 - s = m0 + muladd(_fourT, m1, m2) - a = muladd(_twoT, m1, m0) + s = m0 + muladd(fourT, m1, m2) + a = muladd(twoT, m1, m0) num = muladd(δ, a, num) den = muladd(s, a, den) end - vr2s[I] = iszero(den) ? _zeroT : α * num * inv(den) - end - - else - vmask = vec(mask) - - @inbounds @batch for I in eachindex(vr2s) - if vmask[I] - m0 = vmag[I,1] - m1 = vmag[I,2] - m2 = vmag[I,3] - - δ = m0 - m2 - s = m0 + muladd(_fourT, m1, m2) - a = muladd(_twoT, m1, m0) - - num = δ * a - den = s * a - - for t in 2:NT-2 - m0 = m1 - m1 = m2 - m2 = vmag[I,t+2] - - δ = m0 - m2 - s = m0 + muladd(_fourT, m1, m2) - a = muladd(_twoT, m1, m0) - - num = muladd(δ, a, num) - den = muladd(s, a, den) - end - - vr2s[I] = iszero(den) ? _zeroT : α * num * inv(den) - end + vr2s[I] = iszero(den) ? zeroT : α * num * inv(den) end end @@ -232,12 +197,12 @@ function r2star_crsi( ) where {T<:AbstractFloat, N, NT, NR} N > 1 || throw(ArgumentError("array must contain echoes in last dimension")) NT > 1 || throw(ArgumentError("data must be multi-echo")) - - size(mag, N) == NT || throw(DimensionMismatch()) - mask === nothing || length(mask) == length(mag) ÷ NT || throw(DimensionMismatch()) - sigma === nothing || NR == N-1 || throw(DimensionMismatch()) M > 0 || throw(ArgumentError("interpolation factor M must be greater than 0")) + checkshape((NT,), (size(mag, N),), (:TEs, :mag)) + mask !== nothing && checkshape(axes(mask), axes(mag)[1:N-1], (:mask, :mag)) + sigma !== nothing && checkshape((NR,), (N-1,), (:Rsz, :mag)) + P = tmap(x -> x*x, mag) r2s = similar(mag, size(mag)[1:N-1]) r2s = tfill!(r2s, zero(T)) @@ -251,7 +216,7 @@ function r2star_crsi( σ2 = _noise_crsi(P, Rsz, mask) end - _zeroT = zero(T) + zeroT = zero(T) α = convert(T, 1//2) β = convert(T, -α * σ2 * (TEs[end] - TEs[1])) @@ -259,10 +224,10 @@ function r2star_crsi( γ0 = SVector{M+1, T}([(2*M - 2*m + 1) / (2*M + 2) for m in 0:M]...) γ1 = SVector{M+1, T}([(2*m + 1) / (2*M + 2) for m in 0:M]...) - if mask === nothing - @inbounds @batch per=thread for I in eachindex(vr2s) + @batch for I in eachindex(vr2s) + if mask === nothing || mask[I] den = β - for t in Base.OneTo(NT-1) + @inbounds for t in 1:NT-1 p0 = vP[I,t] p1 = vP[I,t+1] p = pow(p0, γ0[1]) * pow(p1, γ1[1]) @@ -272,27 +237,7 @@ function r2star_crsi( den = muladd(τ[t], p, den) end - vr2s[I] = iszero(den) ? _zeroT : α * (vP[I,1] - vP[I,end]) * inv(den) - end - - else - vmask = vec(mask) - - @inbounds @batch per=thread for I in eachindex(vr2s) - if vmask[I] - den = β - for t in Base.OneTo(NT-1) - p0 = vP[I,t] - p1 = vP[I,t+1] - p = pow(p0, γ0[1]) * pow(p1, γ1[1]) - for m in 2:M+1 - p = muladd(pow(p0, γ0[m]), pow(p1, γ1[m]), p) - end - den = muladd(τ[t], p, den) - end - - vr2s[I] = iszero(den) ? _zeroT : α * (vP[I,1] - vP[I,end]) * inv(den) - end + vr2s[I] = iszero(den) ? zeroT : α * (vP[I,1] - vP[I,end]) * inv(den) end end @@ -304,42 +249,31 @@ function _noise_crsi( Rsz::NTuple{M, Integer}, mask::Union{Nothing, AbstractArray{Bool}} = nothing ) where {T<:AbstractFloat, N, M} - M == N-1 || throw(DimensionMismatch()) - - sz = size(P) - rsz = ntuple(n -> min(Rsz[n], (sz[n]-2)÷2), Val(N-1)) + checkshape((M,), (N-1,), (:Rsz, :P)) + mask !== nothing && checkshape(axes(mask), axes(P)[1:3], (:mask, :P)) - lb = rsz .+ 2 - ub = sz[1:M] .- 1 .- rsz + sz = size(P)[1:M] + rsz = map(min, Rsz, (sz.-2).÷2) - outer = CartesianIndices(ntuple(n -> 2:sz[n]-1, Val(N-1))) - inner = CartesianIndices(ntuple(n -> 2+rsz[n]:sz[n]-1-rsz[n], Val(N-1))) + outer = ntuple(d -> 2:sz[d]-1, Val(M)) + inner = ntuple(d -> 2+rsz[d]:sz[d]-1-rsz[d], Val(M)) - n = 0 - σ2 = zero(T) - - if mask === nothing - @inbounds for t in axes(P, N) - for I in EdgeIterator(outer, inner) - if all(n -> I[n] < lb[n] || I[n] > ub[n], 1:M) - σ2 += P[I,t] - n += 1 - end - end - end + m = Ref(0) + s = Ref(zero(T)) - else - @inbounds for t in axes(P, N) - for I in EdgeIterator(outer, inner) - if all(n -> I[n] < lb[n] || I[n] > ub[n], 1:M) && !mask[I] - σ2 += P[I,t] - n += 1 - end + for t in axes(P, N) + _edgeloop(outer, inner) do I... + if all(map(∉, I, inner)) && (mask === nothing || (@inbounds !mask[I...])) + s[] += (@inbounds P[I...,t]) + m[] += 1 end end end - return n == 0 ? zero(T) : σ2 / n + σ2 = s[] + n = m[] + + return iszero(n) ? zero(T) : σ2 / n end @@ -374,8 +308,8 @@ function r2star_numart2s( N > 1 || throw(ArgumentError("array must contain echoes in last dimension")) NT > 1 || throw(ArgumentError("data must be multi-echo")) - size(mag, N) == NT || throw(DimensionMismatch()) - mask === nothing || length(mask) == length(mag) ÷ NT || throw(DimensionMismatch()) + checkshape((NT,), (size(mag, N),), (:TEs, :mag)) + mask !== nothing && checkshape(axes(mask), axes(mag)[1:N-1], (:mask, :mag)) r2s = similar(mag, size(mag)[1:N-1]) r2s = tfill!(r2s, zero(T)) @@ -383,31 +317,17 @@ function r2star_numart2s( vmag = reshape(mag, :, NT) vr2s = vec(r2s) - _zeroT = zero(T) + zeroT = zero(T) α = convert(T, 2*(NT - 1) / (TEs[end] - TEs[1])) - if mask === nothing - @inbounds @batch for I in eachindex(vr2s) + @batch for I in eachindex(vr2s) + @inbounds if mask === nothing || mask[I] den = vmag[I,1] for t in 2:NT-1 den += vmag[I,t] + vmag[I,t] end den += vmag[I,NT] - vr2s[I] = iszero(den) ? _zeroT : α * (vmag[I,1] - vmag[I,NT]) * inv(den) - end - - else - vmask = vec(mask) - - @inbounds @batch for I in eachindex(vr2s) - if vmask[I] - den = vmag[I,1] - for t in 2:NT-1 - den += vmag[I,t] + vmag[I,t] - end - den += vmag[I,NT] - vr2s[I] = iszero(den) ? _zeroT : α * (vmag[I,1] - vmag[I,NT]) * inv(den) - end + vr2s[I] = iszero(den) ? zeroT : α * (vmag[I,1] - vmag[I,NT]) * inv(den) end end diff --git a/src/utils/utils.jl b/src/utils/utils.jl index 06a3f36..b8fa9bc 100644 --- a/src/utils/utils.jl +++ b/src/utils/utils.jl @@ -183,12 +183,12 @@ function _padarray_kernel!(xp::AbstractArray, x::AbstractArray, getindex_pad) ΔI = CartesianIndex((size(xp) .- size(x) .+ 1) .>> 1) # TODO: disable threading for small xp - @inbounds @batch for Ip in CartesianIndices(xp) + @batch for Ip in CartesianIndices(xp) I = Ip - ΔI if any(map(∉, I.I, ax)) - xp[Ip] = getindex_pad(x, I, lo, hi) + @inbounds xp[Ip] = getindex_pad(x, I, lo, hi) else - xp[Ip] = x[I] + @inbounds xp[Ip] = x[I] end end @@ -396,10 +396,10 @@ function erode_mask!( nx, ny, nz = size(m0) for t in 1:iter - @inbounds @batch for k in 1+t:nz-t + @batch for k in 1+t:nz-t for j in 1+t:ny-t for i in 1+t:nx-t - m1[i,j,k] = __erode_kernel(m0, i, j, k) + @inbounds m1[i,j,k] = __erode_kernel(m0, i, j, k) end end end @@ -468,8 +468,8 @@ function psf2otf( _kp = k else _kp = tfill!(similar(k, sz), zero(T)) - @inbounds @batch minbatch=1024 for I in CartesianIndices(k) - _kp[I] = k[I] + @batch minbatch=1024 for I in CartesianIndices(k) + @inbounds _kp[I] = k[I] end end @@ -492,6 +492,128 @@ function psf2otf( end +function edge_indices( + x::AbstractArray{T, N}, + mask::Union{Nothing, AbstractArray{Bool, N}} = nothing +) where {T, N} + edge_indices(axes(x), mask) +end + +function edge_indices( + outer::NTuple{N, AbstractUnitRange{Int}}, + mask::Union{Nothing, AbstractArray{Bool, N}} = nothing +) where {N} + frst = map(first, outer) + lst = map(last, outer) + stp = map(step, outer) + inner = ntuple(d -> frst[d]+stp[d]:lst[d]-stp[d], Val(N)) + edge_indices(outer, inner, mask) +end + +function edge_indices( + outer::NTuple{N, AbstractUnitRange{Int}}, + inner::NTuple{N, AbstractUnitRange{Int}}, + mask::Union{Nothing, AbstractArray{Bool, N}} = nothing +) where {N} + all(first.(inner) .∈ outer) && all(last.(inner) .∈ outer) || + throw(DimensionMismatch("inner$inner must be in the interior of outer$outer")) + + mask !== nothing && checkshape(outer, axes(mask), (:outer, :mask)) + + if mask === nothing + n = prod(map(length, outer)) - prod(map(length, inner)) + + else + m = Ref(0) + _edgeloop(outer, inner) do I... + if (@inbounds mask[I...]) + m[] += 1 + end + end + n = m[] + end + + if iszero(n) + return Vector{NTuple{N, Int}}() + end + + E = Vector{NTuple{N, Int}}(undef, n) + i = Ref(0) + _edgeloop(outer, inner) do I... + if mask === nothing || (@inbounds mask[I...]) + @inbounds E[i[] += 1] = I + end + end + + return E +end + + +# totally necessary unrolled TiledIteration.EdgeIterator loop +edgeloop(f!, outer::CartesianIndices, inner::CartesianIndices) = + edgeloop(f!, outer.indices, inner.indices) + +@inline function edgeloop( + f!, + outer::NTuple{N, AbstractUnitRange{Int}}, + inner::NTuple{N, AbstractUnitRange{Int}}, +) where {N} + all(first.(inner) .∈ outer) && all(last.(inner) .∈ outer) || + throw(DimensionMismatch("inner$inner must be in the interior of outer$outer")) + _edgeloop(f!, outer, inner) +end + +@generated function _edgeloop( + f!::F, + outer::NTuple{N, AbstractUnitRange{Int}}, + inner::NTuple{N, AbstractUnitRange{Int}}, +) where {F, N} + N == 0 && return :(nothing) + I = ntuple(d -> Symbol(:I, d), Val(N)) + + exf! = quote + f!($(I...)) + end + + ex = quote + for $(I[1]) in first(outer[1]):first(inner[1])-1 + $exf! + end + for $(I[1]) in last(inner[1])+1:last(outer[1]) + $exf! + end + end + + for d in 2:N + expp = exf! + for n in 1:d-1 + expp = quote + for $(I[n]) in outer[$n] + $expp + end + end + end + + ex = quote + for $(I[d]) in first(outer[$d]):first(inner[$d])-1 + $expp + end + for $(I[d]) in inner[$d] + $ex + end + for $(I[d]) in last(inner[$d])+1:last(outer[$d]) + $expp + end + end + end + + quote + Base.@_inline_meta + $ex + end +end + + ##### ##### Multi-threaded Base utilities #####