diff --git a/src/QSM.jl b/src/QSM.jl
index e3851c4..7695178 100644
--- a/src/QSM.jl
+++ b/src/QSM.jl
@@ -16,7 +16,7 @@ using SLEEFPirates: pow, sincos_fast
 using Static: known
 using StaticArrays: SVector
 using ThreadingUtilities: initialize_task
-using TiledIteration: EdgeIterator, TileIterator, padded_tilesize
+using TiledIteration: TileIterator, padded_tilesize
 
 using LinearAlgebra
 using FFTW
@@ -52,11 +52,13 @@ end
 #####
 ##### Polyester.jl
 #####
+
 function reset_threading()
-    # after user interrupt during @batch loop, threading has to be reset:
+    # if @batch loop gets interrupted, threading has to be reset:
     # https://github.com/JuliaSIMD/Polyester.jl/issues/30
+    nt = min(nthreads(), (Sys.CPU_THREADS)::Int) - 1
     reset_workers!()
-    foreach(initialize_task, 1:min(nthreads(), (Sys.CPU_THREADS)::Int) - 1)
+    foreach(initialize_task, 1:nt)
     return nothing
 end
 
@@ -64,8 +66,8 @@ end
 #####
 ##### FFTW.jl
 #####
-const FFTW_NTHREADS = Ref{Int}(known(num_cores()))
 
+const FFTW_NTHREADS = Ref{Int}(known(num_cores()))
 
 @static if FFTW.fftw_provider == "fftw"
     # modified `FFTW.spawnloop` to use Polyester for multi-threading
diff --git a/src/unwrap/laplacian.jl b/src/unwrap/laplacian.jl
index 0a4d3a4..123056d 100644
--- a/src/unwrap/laplacian.jl
+++ b/src/unwrap/laplacian.jl
@@ -139,7 +139,7 @@ function wrapped_laplacian!(
 ) where {T<:AbstractFloat}
     checkshape(d2u, u, (:d2u, :u))
 
-    nx, ny, nz, _ = size(u)
+    nx, ny, nz = size(u)[1:3]
     dx2, dy2, dz2 = convert.(T, inv.(dx.*dx))
 
     tsz = padded_tilesize(T, (2, 2, 2), 1)
@@ -173,7 +173,6 @@ end
 ##### Boundary treatment
 #####
 
-# unroll? probably too aggressive
 function wrapped_laplacian_boundary_neumann!(
     d2u::AbstractArray{<:AbstractFloat, N},
     u::AbstractArray{T, N},
@@ -182,26 +181,17 @@ function wrapped_laplacian_boundary_neumann!(
     N ∈ (3, 4) || throw(ArgumentError("arrays must be 3d or 4d, got $(N)d"))
     checkshape(d2u, u, (:d2u, :u))
 
-    sz = size(u)
-    nx, ny, nz = sz[1:3]
+    zeroT = zero(T)
+    nx, ny, nz = size(u)[1:3]
     dx2, dy2, dz2 = convert.(T, inv.(dx.*dx))
 
-    _zero = zero(T)
-
-    outer = CartesianIndices(ntuple(n -> 1:sz[n], Val(3)))
-    inner = CartesianIndices(ntuple(n -> 2:sz[n]-1, Val(3)))
-    E = EdgeIterator(outer, inner)
-
-    R = Vector{NTuple{3, Int}}(undef, length(E))
-    @inbounds for (i, I) in enumerate(E)
-        R[i] = Tuple(I)
-    end
+    R = edge_indices(axes(u)[1:3])
 
     @inbounds for t in axes(u, 4)
         _u = @view(u[:,:,:,t])
         _d2u = @view(d2u[:,:,:,t])
         @batch per=thread for (i, j, k) in R
-            Δ = _zero
+            Δ = zeroT
 
             if k > 1
                 Δ = muladd(-dz2, _wrap(_u[i,j,k] - _u[i,j,k-1]), Δ)
@@ -243,40 +233,32 @@ function wrapped_laplacian_boundary_periodic!(
     N ∈ (3, 4) || throw(ArgumentError("arrays must be 3d or 4d, got $(N)d"))
     checkshape(d2u, u, (:d2u, :u))
 
-    sz = size(u)
-    nx, ny, nz = sz[1:3]
+    nx, ny, nz = size(u)[1:3]
     dx2, dy2, dz2 = convert.(T, inv.(dx.*dx))
 
-    outer = CartesianIndices(ntuple(n -> 1:sz[n], Val(3)))
-    inner = CartesianIndices(ntuple(n -> 2:sz[n]-1, Val(3)))
-    E = EdgeIterator(outer, inner)
-
-    R = Vector{NTuple{3, Int}}(undef, length(E))
-    @inbounds for (i, I) in enumerate(E)
-        R[i] = Tuple(I)
-    end
+    R = edge_indices(axes(u)[1:3])
 
     @inbounds for t in axes(u, 4)
         _u = @view(u[:,:,:,t])
         _d2u = @view(d2u[:,:,:,t])
         @batch per=thread for (i, j, k) in R
-            du = k == 1 ? _u[i,j,k] - _u[i,j,end] : _u[i,j,k] - _u[i,j,k-1]
-            Δ = -dz2 * _wrap(du)
+            v = k == 1 ? _u[i,j,end] : _u[i,j,k-1]
+            Δ = -dz2 * _wrap(_u[i,j,k] - v)
 
-            du = j == 1 ? _u[i,j,k] - _u[i,end,k] : _u[i,j,k] - _u[i,j-1,k]
-            Δ = muladd(-dy2, _wrap(du), Δ)
+            v = j == 1 ? _u[i,end,k] : _u[i,j-1,k]
+            Δ = muladd(-dy2, _wrap(_u[i,j,k] - v), Δ)
 
-            du = i == 1 ? _u[i,j,k] - _u[end,j,k] : _u[i,j,k] - _u[i-1,j,k]
-            Δ = muladd(-dx2, _wrap(du), Δ)
+            v = i == 1 ? _u[end,j,k] : _u[i-1,j,k]
+            Δ = muladd(-dx2, _wrap(_u[i,j,k] - v), Δ)
 
-            du = i == nx ? _u[1,j,k] - _u[i,j,k] : _u[i+1,j,k] - _u[i,j,k]
-            Δ = muladd(dx2, _wrap(du), Δ)
+            v = i == nx ? _u[1,j,k] : _u[i+1,j,k]
+            Δ = muladd(dx2, _wrap(v - _u[i,j,k]), Δ)
 
-            du = j == ny ? _u[i,1,k] - _u[i,j,k] : _u[i,j+1,k] - _u[i,j,k]
-            Δ = muladd(dy2, _wrap(du), Δ)
+            v = j == ny ? _u[i,1,k] : _u[i,j+1,k]
+            Δ = muladd(dy2, _wrap(v - _u[i,j,k]), Δ)
 
-            du = k == nz ? _u[i,j,1] - _u[i,j,k] : _u[i,j,k+1] - _u[i,j,k]
-            Δ = muladd(dz2, _wrap(du), Δ)
+            v = k == nz ? _u[i,j,1] : _u[i,j,k+1]
+            Δ = muladd(dz2, _wrap(v - _u[i,j,k]), Δ)
 
             _d2u[i,j,k] = Δ
         end
diff --git a/src/utils/poisson_solver/multigrid/poisson.jl b/src/utils/poisson_solver/multigrid/poisson.jl
index fb4a560..35113c6 100644
--- a/src/utils/poisson_solver/multigrid/poisson.jl
+++ b/src/utils/poisson_solver/multigrid/poisson.jl
@@ -261,13 +261,10 @@ function boundary_mask!(
 
     b = tzero(mb)
 
-    # array boundary
-    outer = CartesianIndices(ntuple(n -> 2:sz[n]-1, Val(3)))
-    inner = CartesianIndices(ntuple(n -> 3:sz[n]-2-Int(iseven(sz[n])), Val(3)))
-    if !isempty(inner)
-        @inbounds for I in EdgeIterator(outer, inner)
-            b[I] = m[I]
-        end
+    outer = ntuple(n -> 2:sz[n]-1, Val(3))
+    inner = ntuple(n -> 3:sz[n]-2-Int(iseven(sz[n])), Val(3))
+    _edgeloop(outer, inner) do i, j, k
+        @inbounds b[i,j,k] = m[i,j,k]
     end
 
     # interior
diff --git a/src/utils/r2star.jl b/src/utils/r2star.jl
index 6d78b79..a524bc6 100644
--- a/src/utils/r2star.jl
+++ b/src/utils/r2star.jl
@@ -23,8 +23,8 @@ function r2star_ll(
     N > 1 || throw(ArgumentError("array must contain echoes in last dimension"))
     NT > 1 || throw(ArgumentError("data must be multi-echo"))
 
-    size(mag, N) == NT || throw(DimensionMismatch())
-    mask === nothing || length(mask) == length(mag) ÷ NT || throw(DimensionMismatch())
+    checkshape((NT,), (size(mag, N),), (:TEs, :mag))
+    mask !== nothing && checkshape(axes(mask), axes(mag)[1:N-1], (:mask, :mag))
 
     r2s = similar(mag, size(mag)[1:N-1])
     r2s = tfill!(r2s, zero(T))
@@ -37,34 +37,32 @@ function r2star_ll(
     if mask === nothing
         b = tmap(log, transpose(vmag))
         x̂ = ldiv!(A, b)
-        @inbounds @batch for I in eachindex(vr2s)
-            vr2s[I] = x̂[1,I]
+        @batch for I in eachindex(vr2s)
+            @inbounds vr2s[I] = x̂[1,I]
         end
 
     else
-        vmask = vec(mask)
-
         i = 0
-        R = Vector{Int}(undef, sum(vmask))
-        @inbounds for I in eachindex(vmask)
-            if vmask[I]
+        R = Vector{Int}(undef, sum(mask))
+        @inbounds for I in eachindex(mask)
+            if mask[I]
                 R[i += 1] = I
             end
         end
 
         b = similar(mag, (NT, length(R)))
-        @inbounds for t in Base.OneTo(NT)
+        for t in 1:NT
             bt = @view(b[t,:])
             mt = @view(vmag[:,t])
             @batch for I in eachindex(R)
-                bt[I] = log(mt[R[I]])
+                @inbounds bt[I] = log(mt[R[I]])
             end
         end
 
         x̂ = ldiv!(A, b)
 
-        @inbounds @batch for I in eachindex(R)
-            vr2s[R[I]] = x̂[1,I]
+        @batch for I in eachindex(R)
+            @inbounds vr2s[R[I]] = x̂[1,I]
         end
     end
 
@@ -103,8 +101,8 @@ function r2star_arlo(
     N > 1 || throw(ArgumentError("array must contain echoes in last dimension"))
     NT > 2 || throw(ArgumentError("ARLO requires at least 3 echoes"))
 
-    size(mag, N) == NT || throw(DimensionMismatch())
-    mask === nothing || length(mask) == length(mag) ÷ NT || throw(DimensionMismatch())
+    checkshape((NT,), (size(mag, N),), (:TEs, :mag))
+    mask !== nothing && checkshape(axes(mask), axes(mag)[1:N-1], (:mask, :mag))
 
     all((≈)(TEs[2]-TEs[1]), TEs[2:end].-TEs[1:end-1]) ||
         throw(DomainError("ARLO requires equidistant echoes"))
@@ -115,21 +113,21 @@ function r2star_arlo(
     vmag = reshape(mag, :, NT)
     vr2s = vec(r2s)
 
-    _zeroT = zero(T)
-    _twoT  = convert(T, 2)
-    _fourT = convert(T, 4)
+    zeroT = zero(T)
+    twoT  = convert(T, 2)
+    fourT = convert(T, 4)
 
     α = convert(T, 3 / (TEs[2]-TEs[1]))
 
-    if mask === nothing
-        @inbounds @batch for I in eachindex(vr2s)
+    @batch for I in eachindex(vr2s)
+        @inbounds if mask === nothing || mask[I]
             m0 = vmag[I,1]
             m1 = vmag[I,2]
             m2 = vmag[I,3]
 
             δ = m0 - m2
-            s = m0 + muladd(_fourT, m1, m2)
-            a = muladd(_twoT, m1, m0)
+            s = m0 + muladd(fourT, m1, m2)
+            a = muladd(twoT, m1, m0)
 
             num = δ * a
             den = s * a
@@ -140,47 +138,14 @@ function r2star_arlo(
                 m2 = vmag[I,t+2]
 
                 δ = m0 - m2
-                s = m0 + muladd(_fourT, m1, m2)
-                a = muladd(_twoT, m1, m0)
+                s = m0 + muladd(fourT, m1, m2)
+                a = muladd(twoT, m1, m0)
 
                 num = muladd(δ, a, num)
                 den = muladd(s, a, den)
             end
 
-            vr2s[I] = iszero(den) ? _zeroT : α * num * inv(den)
-        end
-
-    else
-        vmask = vec(mask)
-
-        @inbounds @batch for I in eachindex(vr2s)
-            if vmask[I]
-                m0 = vmag[I,1]
-                m1 = vmag[I,2]
-                m2 = vmag[I,3]
-
-                δ = m0 - m2
-                s = m0 + muladd(_fourT, m1, m2)
-                a = muladd(_twoT, m1, m0)
-
-                num = δ * a
-                den = s * a
-
-                for t in 2:NT-2
-                    m0 = m1
-                    m1 = m2
-                    m2 = vmag[I,t+2]
-
-                    δ = m0 - m2
-                    s = m0 + muladd(_fourT, m1, m2)
-                    a = muladd(_twoT, m1, m0)
-
-                    num = muladd(δ, a, num)
-                    den = muladd(s, a, den)
-                end
-
-                vr2s[I] = iszero(den) ? _zeroT : α * num * inv(den)
-            end
+            vr2s[I] = iszero(den) ? zeroT : α * num * inv(den)
         end
     end
 
@@ -232,12 +197,12 @@ function r2star_crsi(
 ) where {T<:AbstractFloat, N, NT, NR}
     N > 1 || throw(ArgumentError("array must contain echoes in last dimension"))
     NT > 1 || throw(ArgumentError("data must be multi-echo"))
-
-    size(mag, N) == NT || throw(DimensionMismatch())
-    mask === nothing || length(mask) == length(mag) ÷ NT || throw(DimensionMismatch())
-    sigma === nothing || NR == N-1 || throw(DimensionMismatch())
     M > 0 || throw(ArgumentError("interpolation factor M must be greater than 0"))
 
+    checkshape((NT,), (size(mag, N),), (:TEs, :mag))
+    mask !== nothing && checkshape(axes(mask), axes(mag)[1:N-1], (:mask, :mag))
+    sigma !== nothing && checkshape((NR,), (N-1,), (:Rsz, :mag))
+
     P = tmap(x -> x*x, mag)
     r2s = similar(mag, size(mag)[1:N-1])
     r2s = tfill!(r2s, zero(T))
@@ -251,7 +216,7 @@ function r2star_crsi(
         σ2 = _noise_crsi(P, Rsz, mask)
     end
 
-    _zeroT = zero(T)
+    zeroT = zero(T)
     α = convert(T, 1//2)
     β = convert(T, -α * σ2 * (TEs[end] - TEs[1]))
 
@@ -259,10 +224,10 @@ function r2star_crsi(
     γ0 = SVector{M+1, T}([(2*M - 2*m + 1) / (2*M + 2) for m in 0:M]...)
     γ1 = SVector{M+1, T}([(2*m + 1) / (2*M + 2) for m in 0:M]...)
 
-    if mask === nothing
-        @inbounds @batch per=thread for I in eachindex(vr2s)
+    @batch for I in eachindex(vr2s)
+        if mask === nothing || mask[I]
             den = β
-            for t in Base.OneTo(NT-1)
+            @inbounds for t in 1:NT-1
                 p0 = vP[I,t]
                 p1 = vP[I,t+1]
                 p = pow(p0, γ0[1]) * pow(p1, γ1[1])
@@ -272,27 +237,7 @@ function r2star_crsi(
                 den = muladd(τ[t], p, den)
             end
 
-            vr2s[I] = iszero(den) ? _zeroT : α * (vP[I,1] - vP[I,end]) * inv(den)
-        end
-
-    else
-        vmask = vec(mask)
-
-        @inbounds @batch per=thread for I in eachindex(vr2s)
-            if vmask[I]
-                den = β
-                for t in Base.OneTo(NT-1)
-                    p0 = vP[I,t]
-                    p1 = vP[I,t+1]
-                    p = pow(p0, γ0[1]) * pow(p1, γ1[1])
-                    for m in 2:M+1
-                        p = muladd(pow(p0, γ0[m]), pow(p1, γ1[m]), p)
-                    end
-                    den = muladd(τ[t], p, den)
-                end
-
-                vr2s[I] = iszero(den) ? _zeroT : α * (vP[I,1] - vP[I,end]) * inv(den)
-            end
+            vr2s[I] = iszero(den) ? zeroT : α * (vP[I,1] - vP[I,end]) * inv(den)
         end
     end
 
@@ -304,42 +249,31 @@ function _noise_crsi(
     Rsz::NTuple{M, Integer},
     mask::Union{Nothing, AbstractArray{Bool}} = nothing
 ) where {T<:AbstractFloat, N, M}
-    M == N-1 || throw(DimensionMismatch())
-
-    sz = size(P)
-    rsz = ntuple(n -> min(Rsz[n], (sz[n]-2)÷2), Val(N-1))
+    checkshape((M,), (N-1,), (:Rsz, :P))
+    mask !== nothing && checkshape(axes(mask), axes(P)[1:3], (:mask, :P))
 
-    lb = rsz .+ 2
-    ub = sz[1:M] .- 1 .- rsz
+    sz = size(P)[1:M]
+    rsz = map(min, Rsz, (sz.-2).÷2)
 
-    outer = CartesianIndices(ntuple(n -> 2:sz[n]-1, Val(N-1)))
-    inner = CartesianIndices(ntuple(n -> 2+rsz[n]:sz[n]-1-rsz[n], Val(N-1)))
+    outer = ntuple(d -> 2:sz[d]-1, Val(M))
+    inner = ntuple(d -> 2+rsz[d]:sz[d]-1-rsz[d], Val(M))
 
-    n = 0
-    σ2 = zero(T)
-
-    if mask === nothing
-        @inbounds for t in axes(P, N)
-            for I in EdgeIterator(outer, inner)
-                if all(n -> I[n] < lb[n] || I[n] > ub[n], 1:M)
-                    σ2 += P[I,t]
-                    n += 1
-                end
-            end
-        end
+    m = Ref(0)
+    s = Ref(zero(T))
 
-    else
-        @inbounds for t in axes(P, N)
-            for I in EdgeIterator(outer, inner)
-                if all(n -> I[n] < lb[n] || I[n] > ub[n], 1:M) && !mask[I]
-                    σ2 += P[I,t]
-                    n += 1
-                end
+    for t in axes(P, N)
+        _edgeloop(outer, inner) do I...
+            if all(map(∉, I, inner)) && (mask === nothing || (@inbounds !mask[I...]))
+                s[] += (@inbounds P[I...,t])
+                m[] += 1
             end
         end
     end
 
-    return n == 0 ? zero(T) : σ2 / n
+    σ2 = s[]
+    n = m[]
+
+    return iszero(n) ? zero(T) : σ2 / n
 end
 
 
@@ -374,8 +308,8 @@ function r2star_numart2s(
     N > 1 || throw(ArgumentError("array must contain echoes in last dimension"))
     NT > 1 || throw(ArgumentError("data must be multi-echo"))
 
-    size(mag, N) == NT || throw(DimensionMismatch())
-    mask === nothing || length(mask) == length(mag) ÷ NT || throw(DimensionMismatch())
+    checkshape((NT,), (size(mag, N),), (:TEs, :mag))
+    mask !== nothing && checkshape(axes(mask), axes(mag)[1:N-1], (:mask, :mag))
 
     r2s = similar(mag, size(mag)[1:N-1])
     r2s = tfill!(r2s, zero(T))
@@ -383,31 +317,17 @@ function r2star_numart2s(
     vmag = reshape(mag, :, NT)
     vr2s = vec(r2s)
 
-    _zeroT = zero(T)
+    zeroT = zero(T)
     α = convert(T, 2*(NT - 1) / (TEs[end] - TEs[1]))
 
-    if mask === nothing
-        @inbounds @batch for I in eachindex(vr2s)
+    @batch for I in eachindex(vr2s)
+        @inbounds if mask === nothing || mask[I]
             den = vmag[I,1]
             for t in 2:NT-1
                 den += vmag[I,t] + vmag[I,t]
             end
             den += vmag[I,NT]
-            vr2s[I] = iszero(den) ? _zeroT : α * (vmag[I,1] - vmag[I,NT]) * inv(den)
-        end
-
-    else
-        vmask = vec(mask)
-
-        @inbounds @batch for I in eachindex(vr2s)
-            if vmask[I]
-                den = vmag[I,1]
-                for t in 2:NT-1
-                    den += vmag[I,t] + vmag[I,t]
-                end
-                den += vmag[I,NT]
-                vr2s[I] = iszero(den) ? _zeroT : α * (vmag[I,1] - vmag[I,NT]) * inv(den)
-            end
+            vr2s[I] = iszero(den) ? zeroT : α * (vmag[I,1] - vmag[I,NT]) * inv(den)
         end
     end
 
diff --git a/src/utils/utils.jl b/src/utils/utils.jl
index 06a3f36..b8fa9bc 100644
--- a/src/utils/utils.jl
+++ b/src/utils/utils.jl
@@ -183,12 +183,12 @@ function _padarray_kernel!(xp::AbstractArray, x::AbstractArray, getindex_pad)
     ΔI = CartesianIndex((size(xp) .- size(x) .+ 1) .>> 1)
 
     # TODO: disable threading for small xp
-    @inbounds @batch for Ip in CartesianIndices(xp)
+    @batch for Ip in CartesianIndices(xp)
         I = Ip - ΔI
         if any(map(∉, I.I, ax))
-            xp[Ip] = getindex_pad(x, I, lo, hi)
+            @inbounds xp[Ip] = getindex_pad(x, I, lo, hi)
         else
-            xp[Ip] = x[I]
+            @inbounds xp[Ip] = x[I]
         end
     end
 
@@ -396,10 +396,10 @@ function erode_mask!(
 
     nx, ny, nz = size(m0)
     for t in 1:iter
-        @inbounds @batch for k in 1+t:nz-t
+        @batch for k in 1+t:nz-t
             for j in 1+t:ny-t
                 for i in 1+t:nx-t
-                    m1[i,j,k] = __erode_kernel(m0, i, j, k)
+                    @inbounds m1[i,j,k] = __erode_kernel(m0, i, j, k)
                 end
             end
         end
@@ -468,8 +468,8 @@ function psf2otf(
         _kp = k
     else
         _kp = tfill!(similar(k, sz), zero(T))
-        @inbounds @batch minbatch=1024 for I in CartesianIndices(k)
-            _kp[I] = k[I]
+        @batch minbatch=1024 for I in CartesianIndices(k)
+            @inbounds _kp[I] = k[I]
         end
     end
 
@@ -492,6 +492,128 @@ function psf2otf(
 end
 
 
+function edge_indices(
+    x::AbstractArray{T, N},
+    mask::Union{Nothing, AbstractArray{Bool, N}} = nothing
+) where {T, N}
+    edge_indices(axes(x), mask)
+end
+
+function edge_indices(
+    outer::NTuple{N, AbstractUnitRange{Int}},
+    mask::Union{Nothing, AbstractArray{Bool, N}} = nothing
+) where {N}
+    frst = map(first, outer)
+    lst = map(last, outer)
+    stp = map(step, outer)
+    inner = ntuple(d -> frst[d]+stp[d]:lst[d]-stp[d], Val(N))
+    edge_indices(outer, inner, mask)
+end
+
+function edge_indices(
+    outer::NTuple{N, AbstractUnitRange{Int}},
+    inner::NTuple{N, AbstractUnitRange{Int}},
+    mask::Union{Nothing, AbstractArray{Bool, N}} = nothing
+) where {N}
+    all(first.(inner) .∈ outer) && all(last.(inner) .∈ outer) ||
+        throw(DimensionMismatch("inner$inner must be in the interior of outer$outer"))
+
+    mask !== nothing && checkshape(outer, axes(mask), (:outer, :mask))
+
+    if mask === nothing
+        n = prod(map(length, outer)) - prod(map(length, inner))
+
+    else
+        m = Ref(0)
+        _edgeloop(outer, inner) do I...
+            if (@inbounds mask[I...])
+                m[] += 1
+            end
+        end
+        n = m[]
+    end
+
+    if iszero(n)
+        return Vector{NTuple{N, Int}}()
+    end
+
+    E = Vector{NTuple{N, Int}}(undef, n)
+    i = Ref(0)
+    _edgeloop(outer, inner) do I...
+        if mask === nothing || (@inbounds mask[I...])
+            @inbounds E[i[] += 1] = I
+        end
+    end
+
+    return E
+end
+
+
+# totally necessary unrolled TiledIteration.EdgeIterator loop
+edgeloop(f!, outer::CartesianIndices, inner::CartesianIndices) =
+    edgeloop(f!, outer.indices, inner.indices)
+
+@inline function edgeloop(
+    f!,
+    outer::NTuple{N, AbstractUnitRange{Int}},
+    inner::NTuple{N, AbstractUnitRange{Int}},
+) where {N}
+    all(first.(inner) .∈ outer) && all(last.(inner) .∈ outer) ||
+        throw(DimensionMismatch("inner$inner must be in the interior of outer$outer"))
+    _edgeloop(f!, outer, inner)
+end
+
+@generated function _edgeloop(
+    f!::F,
+    outer::NTuple{N, AbstractUnitRange{Int}},
+    inner::NTuple{N, AbstractUnitRange{Int}},
+) where {F, N}
+    N == 0 && return :(nothing)
+    I = ntuple(d -> Symbol(:I, d), Val(N))
+
+    exf! = quote
+        f!($(I...))
+    end
+
+    ex = quote
+        for $(I[1]) in first(outer[1]):first(inner[1])-1
+            $exf!
+        end
+        for $(I[1]) in last(inner[1])+1:last(outer[1])
+            $exf!
+        end
+    end
+
+    for d in 2:N
+        expp = exf!
+        for n in 1:d-1
+            expp = quote
+                for $(I[n]) in outer[$n]
+                    $expp
+                end
+            end
+        end
+
+        ex = quote
+            for $(I[d]) in first(outer[$d]):first(inner[$d])-1
+                $expp
+            end
+            for $(I[d]) in inner[$d]
+                $ex
+            end
+            for $(I[d]) in last(inner[$d])+1:last(outer[$d])
+                $expp
+            end
+        end
+    end
+
+    quote
+        Base.@_inline_meta
+        $ex
+    end
+end
+
+
 #####
 ##### Multi-threaded Base utilities
 #####