diff --git a/Project.toml b/Project.toml index bd8f0b7d29..52bb20f614 100644 --- a/Project.toml +++ b/Project.toml @@ -45,12 +45,14 @@ IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" NBInclude = "0db19996-df87-5ea3-a455-e3a50d440464" +OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce" +TaskLocalValues = "ed4db957-447d-4319-bfb6-7fa9ae7ecf34" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" [targets] -test = ["BlockArrays", "Downloads", "FerriteGmsh", "ForwardDiff", "Gmsh", "IterativeSolvers", "Metis", "Pkg", "NBInclude", "ProgressMeter", "Random", "SHA", "Test", "TimerOutputs", "Logging"] +test = ["BlockArrays", "Downloads", "FerriteGmsh", "ForwardDiff", "Gmsh", "IterativeSolvers", "Metis", "Pkg", "NBInclude", "OhMyThreads", "ProgressMeter", "Random", "SHA", "TaskLocalValues", "Test", "TimerOutputs", "Logging"] diff --git a/docs/Manifest.toml b/docs/Manifest.toml index 4b67ef2b1f..daad98ede8 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -2,7 +2,7 @@ julia_version = "1.10.5" manifest_format = "2.0" -project_hash = "864d1d29886fc101f4919a6d1c30efd8f4e2ca2e" +project_hash = "4e52f4aa4cee9f66ec4f633f0ae538fbd227ac5e" [[deps.ADTypes]] git-tree-sha1 = "5a5eafb8344b81b8c2237f8a6f6b3602b3f6180e" @@ -112,6 +112,28 @@ weakdeps = ["SparseArrays"] [[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" +[[deps.BangBang]] +deps = ["Accessors", "ConstructionBase", "InitialValues", "LinearAlgebra", "Requires"] +git-tree-sha1 = "e2144b631226d9eeab2d746ca8880b7ccff504ae" +uuid = "198e06fe-97b7-11e9-32a5-e1d131e6ad66" +version = "0.4.3" + + [deps.BangBang.extensions] + BangBangChainRulesCoreExt = "ChainRulesCore" + BangBangDataFramesExt = "DataFrames" + BangBangStaticArraysExt = "StaticArrays" + BangBangStructArraysExt = "StructArrays" + BangBangTablesExt = "Tables" + BangBangTypedTablesExt = "TypedTables" + + [deps.BangBang.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" + StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" + StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" + Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" + TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" + [[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" @@ -188,6 +210,12 @@ git-tree-sha1 = "e579c6157598169ad4ef17263bdf3452b4a3e316" uuid = "5217a498-cd5d-4ec6-b8c2-9b85a09b6e3e" version = "1.1.0" +[[deps.ChunkSplitters]] +deps = ["TestItems"] +git-tree-sha1 = "01d5db8756afc4022b1cf267cfede13245226c72" +uuid = "ae650224-84b6-46f8-82ea-d812ca08434e" +version = "2.6.0" + [[deps.CloseOpenIntervals]] deps = ["Static", "StaticArrayInterface"] git-tree-sha1 = "05ba0d07cd4fd8b7a39541e31a7b0254704ea581" @@ -796,6 +824,11 @@ git-tree-sha1 = "d1b1b796e47d94588b3757fe84fbf65a5ec4a80d" uuid = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" version = "0.1.5" +[[deps.InitialValues]] +git-tree-sha1 = "4da0f88e9a39111c2fa3add390ab15f3a44f3ca3" +uuid = "22cec73e-a1b8-11e9-2c92-598750a2cf9c" +version = "0.3.1" + [[deps.IntelOpenMP_jll]] deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl"] git-tree-sha1 = "10bd689145d2c3b2a9844005d01087cc1194e79e" @@ -1338,6 +1371,12 @@ git-tree-sha1 = 
"887579a3eb005446d514ab7aeac5d1d027658b8f" uuid = "e7412a2a-1a6e-54c0-be00-318e2571c051" version = "1.3.5+1" +[[deps.OhMyThreads]] +deps = ["BangBang", "ChunkSplitters", "StableTasks", "TaskLocalValues"] +git-tree-sha1 = "6acbf70d8306a38a870be56d1841b2c9a4b17837" +uuid = "67456a42-1dca-4109-a031-0a68de7e3ad5" +version = "0.6.2" + [[deps.OpenBLAS_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" @@ -2003,6 +2042,11 @@ weakdeps = ["ChainRulesCore"] [deps.SpecialFunctions.extensions] SpecialFunctionsChainRulesCoreExt = "ChainRulesCore" +[[deps.StableTasks]] +git-tree-sha1 = "073d5c20d44129b20fe954720b97069579fa403b" +uuid = "91464d47-22a1-43fe-8b7f-2d57ee82463f" +version = "0.1.5" + [[deps.Static]] deps = ["CommonWorldInvalidations", "IfElse", "PrecompileTools"] git-tree-sha1 = "87d51a3ee9a4b0d2fe054bdd3fc2436258db2603" @@ -2104,6 +2148,11 @@ deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" version = "1.10.0" +[[deps.TaskLocalValues]] +git-tree-sha1 = "d155450e6dff2a8bc2fcb81dcb194bd98b0aeb46" +uuid = "ed4db957-447d-4319-bfb6-7fa9ae7ecf34" +version = "0.1.2" + [[deps.TensorCore]] deps = ["LinearAlgebra"] git-tree-sha1 = "1feb45f88d133a655e001435632f019a9a1bcdb6" @@ -2120,6 +2169,11 @@ version = "1.16.1" deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +[[deps.TestItems]] +git-tree-sha1 = "42fd9023fef18b9b78c8343a4e2f3813ffbcefcb" +uuid = "1c621080-faea-4a02-84b6-bbd5e436b8fe" +version = "1.0.0" + [[deps.ThreadingUtilities]] deps = ["ManualMemory"] git-tree-sha1 = "eda08f7e9818eb53661b3deb74e3159460dfbc27" diff --git a/docs/Project.toml b/docs/Project.toml index 944be43ceb..ead8045637 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -13,11 +13,13 @@ LineSearches = "d3d80556-e9d4-5f37-9878-2ab0fcc64255" LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae" Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589" +OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5" Optim = "429524aa-4258-5aef-a3af-852621145aeb" OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" +TaskLocalValues = "ed4db957-447d-4319-bfb6-7fa9ae7ecf34" Tensors = "48a634ad-e948-5137-8d70-aa71f2a747f4" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" diff --git a/docs/src/literate-howto/threaded_assembly.jl b/docs/src/literate-howto/threaded_assembly.jl index fa0526fa9f..8f47cccc75 100644 --- a/docs/src/literate-howto/threaded_assembly.jl +++ b/docs/src/literate-howto/threaded_assembly.jl @@ -1,3 +1,6 @@ +# ```@meta +# Draft = false +# ``` # # [Multi-threaded assembly](@id tutorial-threaded-assembly) # #- @@ -5,198 +8,310 @@ #md # This example is also available as a Jupyter notebook: #md # [`threaded_assembly.ipynb`](@__NBVIEWER_ROOT_URL__/howto/threaded_assembly.ipynb). #- + +# ## Introduction +# +# In this howto we will explore how to use task based multithreading (shared memory +# parallelism) to speed up the analysis. Some parts of a finite element simulation are +# trivially parallelizable such as the computation of the local element contributions since +# each element can be processed independently. 
However, two things need to be considered in
+# order to parallelize safely:
#
-# ## Example of a colored grid
+# - **Modification of shared data**: Although the contributions from all the elements can
+#   be computed independently, eventually they need to be assembled into the global
+#   matrix and vector. Letting each task assemble its own contribution would lead to
+#   race conditions since elements share degrees of freedom with each other. There are
+#   various ways to remedy this, for example:
+#   - **Locking**: By using a lock around the call to `assemble!` we can ensure that only
+#     one task assembles at a time (see the sketch after this list). This is simple to
+#     implement but can lead to lock contention and thus poor performance. Another
+#     drawback is that the results will not be deterministic since floating point
+#     operations are neither associative nor commutative.
+#   - **Assembler task**: By using a designated task for the assembling we (obviously)
+#     ensure that only a single task assembles. The worker tasks (the tasks computing the
+#     element contributions) would then hand off their results to the assembly task. This
+#     can be a useful approach if computing the element contributions is much slower than
+#     the assembly -- otherwise the assembler task can't keep up with the worker tasks.
+#     There might also be some extra overhead because of task switching in the scheduler.
+#     The problem with non-deterministic results still remains.
+#   - **Grid coloring**: By "coloring" the grid such that, within each color, no two
+#     elements share degrees of freedom, we can safely assemble each color in parallel.
+#     Even if concurrently running tasks write to the global matrix and vector, they
+#     will not write to the same memory locations. Note also that this procedure gives
+#     predictable results: for a memory location to which, for example, a "red", a
+#     "blue", and a "green" element contribute, we always add the red contribution
+#     first, then the blue, and finally the green.
+# - **Scratch data**: In order to speed up the computation of the element contributions we
+#   typically pre-allocate some data structures that can be reused for every element. Such
+#   scratch data include, for example, the local matrix and vector, and the CellValues.
+#   Each task needs its own copy of the scratch data since it will be modified for each
+#   element.
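+#
+# To make the locking alternative above concrete, a minimal sketch is shown below. It is
+# not used in the rest of this how-to and simply reuses the names that are set up later
+# (`grid`, `dh`, `cellvalues`, `K`, `f`, `C`, `b`, and the element routine
+# `assemble_cell!`); for brevity it also allocates the scratch buffers per cell instead of
+# per task.
+#
+# ```julia
+# assembler = start_assemble(K, f)
+# lck = ReentrantLock()
+# Threads.@threads for cellidx in 1:getncells(grid)
+#     # Scratch buffers (allocated per cell here to keep the sketch short; in practice
+#     # they should be task local as discussed above)
+#     cell_cache = CellCache(dh)
+#     cv = copy(cellvalues)
+#     n = ndofs_per_cell(dh)
+#     Ke = zeros(n, n)
+#     fe = zeros(n)
+#     # Compute the local contribution of this cell
+#     reinit!(cell_cache, cellidx)
+#     reinit!(cv, cell_cache)
+#     assemble_cell!(Ke, fe, cv, C, b)
+#     # The lock ensures that only one task at a time writes to K and f
+#     lock(lck) do
+#         assemble!(assembler, celldofs(cell_cache), Ke, fe)
+#     end
+# end
+# ```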
For this structured grid the +# greedy algorithm uses fewer colors, but both algorithms result in colors that contain +# roughly the same number of elements. The workstream algorithm is the default one since it +# in general results in more balanced colors. For unstructured grids the greedy algorithm +# can result in colors with very few elements, for example. using Ferrite, SparseArrays function create_example_2d_grid() grid = generate_grid(Quadrilateral, (10, 10), Vec{2}((0.0, 0.0)), Vec{2}((10.0, 10.0))) - colors_workstream = create_coloring(grid; alg=ColoringAlgorithm.WorkStream) - colors_greedy = create_coloring(grid; alg=ColoringAlgorithm.Greedy) + colors_workstream = create_coloring(grid; alg = ColoringAlgorithm.WorkStream) + colors_greedy = create_coloring(grid; alg = ColoringAlgorithm.Greedy) VTKGridFile("colored", grid) do vtk Ferrite.write_cell_colors(vtk, grid, colors_workstream, "workstream-coloring") Ferrite.write_cell_colors(vtk, grid, colors_greedy, "greedy-coloring") end end -create_example_2d_grid(); +create_example_2d_grid() # ![](coloring.png) # # *Figure 1*: Element coloring using the "workstream"-algorithm (left) and the "greedy"- # algorithm (right). -# ## Cantilever beam in 3D with threaded assembly -# We will now look at an example where we assemble the stiffness matrix using multiple -# threads. We set up a simple grid and create a coloring, then create a DofHandler, -# and define the material stiffness +# ## Multithreaded assembly of a cantilever beam in 3D +# +# We will now look at an example where we assemble the stiffness matrix and right hand side +# using multiple threads. The problem setup is a cantilever beam in 3D with a linear elastic +# material behavior. For this exercise we only focus on the multithreading and are not +# bothered with boundary conditions. For more details refer to the [tutorial on linear +# elasticity](../tutorials/linear_elasticity.md). + +# ### Setup +# +# We define the element routine, material stiffness, grid and DofHandler just like in the +# [tutorial on linear elasticity](../tutorials/linear_elasticity.md) without discussing it +# further here. + +## Element routine +function assemble_cell!(Ke::Matrix, fe::Vector, cellvalues::CellValues, C::SymmetricTensor, b::Vec) + fill!(Ke, 0) + fill!(fe, 0) + for q_point in 1:getnquadpoints(cellvalues) + dΩ = getdetJdV(cellvalues, q_point) + for i in 1:getnbasefunctions(cellvalues) + δui = shape_value(cellvalues, q_point, i) + fe[i] += (δui ⋅ b) * dΩ + ∇δui = shape_symmetric_gradient(cellvalues, q_point, i) + for j in 1:getnbasefunctions(cellvalues) + ∇uj = shape_symmetric_gradient(cellvalues, q_point, j) + Ke[i, j] += (∇δui ⊡ C ⊡ ∇uj) * dΩ + end + end + end + return Ke, fe +end -# #### Grid for the beam -function create_colored_cantilever_grid(celltype, n) - grid = generate_grid(celltype, (10*n, n, n), Vec{3}((0.0, 0.0, 0.0)), Vec{3}((10.0, 1.0, 1.0))) +## Material stiffness +function create_material_stiffness() + E = 200.0e9 + ν = 0.3 + λ = E * ν / ((1 + ν) * (1 - 2ν)) + μ = E / (2(1 + ν)) + δ(i, j) = i == j ? 
+    C = SymmetricTensor{4, 3}() do i, j, k, l
+        return λ * δ(i, j) * δ(k, l) + μ * (δ(i, k) * δ(j, l) + δ(i, l) * δ(j, k))
+    end
+    return C
+end
+
+## Grid and grid coloring
+function create_cantilever_grid(n::Int)
+    xmin = Vec{3}((0.0, 0.0, 0.0))
+    xmax = Vec{3}((10.0, 1.0, 1.0))
+    grid = generate_grid(Hexahedron, (10 * n, n, n), xmin, xmax)
    colors = create_coloring(grid)
    return grid, colors
-end;
+end

-# #### DofHandler
-function create_dofhandler(grid::Grid{dim}, ip) where {dim}
+## DofHandler with displacement field u
+function create_dofhandler(grid::Grid, interpolation::VectorInterpolation)
    dh = DofHandler(grid)
-    add!(dh, :u, ip) # Add a displacement field
+    add!(dh, :u, interpolation)
    close!(dh)
-end;
-
-# ### Stiffness tensor for linear elasticity
-function create_stiffness(::Val{dim}) where {dim}
-    E = 200e9
-    ν = 0.3
-    λ = E*ν / ((1+ν) * (1 - 2ν))
-    μ = E / (2(1+ν))
-    δ(i,j) = i == j ? 1.0 : 0.0
-    g(i,j,k,l) = λ*δ(i,j)*δ(k,l) + μ*(δ(i,k)*δ(j,l) + δ(i,l)*δ(j,k))
-    C = SymmetricTensor{4, dim}(g);
-    return C
-end;
+    return dh
+end
+nothing # hide

-# ## Threaded data structures
+# ### Task local scratch data
#
-# ScratchValues is a thread-local collection of data that each thread needs to own,
-# since we need to be able to mutate the data in the threads independently
-struct ScratchValues{T, CV <: CellValues, FV <: FacetValues, TT <: AbstractTensor, dim, AT}
+# We group everything that needs to be duplicated for each task in the struct
+# `ScratchData`:
+# - `cell_cache::CellCache`: contains buffers for coordinates and (global) dofs which will
+#   be `reinit!`ed for each cell.
+# - `cellvalues::CellValues`: the cell values which will be `reinit!`ed for each cell using
+#   the `cell_cache`.
+# - `Ke::Matrix`: the local matrix
+# - `fe::Vector`: the local vector
+# - `assembler`: the assembler (which needs to be duplicated because it contains buffers
+#   that are modified during the call to `assemble!`)
+struct ScratchData{CC, CV, T, A}
+    cell_cache::CC
+    cellvalues::CV
    Ke::Matrix{T}
    fe::Vector{T}
-    cellvalues::CV
-    facetvalues::FV
-    global_dofs::Vector{Int}
-    ɛ::Vector{TT}
-    coordinates::Vector{Vec{dim, T}}
-    assembler::AT
-end;
-
-# Each thread need its own CellValues and FacetValues (although, for this example we don't use
-# the FacetValues)
-function create_values(interpolation_space::Interpolation{refshape}, qr_order::Int) where {dim, refshape<:Ferrite.AbstractRefShape{dim}}
-    ## Interpolations and values
-    quadrature_rule = QuadratureRule{refshape}(qr_order)
-    facet_quadrature_rule = FacetQuadratureRule{refshape}(qr_order)
-    cellvalues = [CellValues(quadrature_rule, interpolation_space) for i in 1:Threads.nthreads()];
-    facetvalues = [FacetValues(facet_quadrature_rule, interpolation_space) for i in 1:Threads.nthreads()];
-    return cellvalues, facetvalues
-end;
-
-# Create a `ScratchValues` for each thread with the thread local data
-function create_scratchvalues(K, f, dh::DofHandler{dim}, ip) where {dim}
-    nthreads = Threads.nthreads()
-    assemblers = [start_assemble(K, f) for i in 1:nthreads]
-    cellvalues, facetvalues = create_values(ip, 2)
-
-    n_basefuncs = getnbasefunctions(cellvalues[1])
-    global_dofs = [zeros(Int, ndofs_per_cell(dh)) for i in 1:nthreads]
-
-    fes = [zeros(n_basefuncs) for i in 1:nthreads] # Local force vector
-    Kes = [zeros(n_basefuncs, n_basefuncs) for i in 1:nthreads]
-
-    ɛs = [[zero(SymmetricTensor{2, dim}) for i in 1:n_basefuncs] for i in 1:nthreads]
-
-    coordinates = [[zero(Vec{dim}) for i in 1:length(dh.grid.cells[1].nodes)] for i in 1:nthreads]
-
-    return [ScratchValues(Kes[i], fes[i], cellvalues[i], facetvalues[i], global_dofs[i],
-                          ɛs[i], coordinates[i], assemblers[i]) for i in 1:nthreads]
-end;
-
-# ## Threaded assemble
+    assembler::A
end
-# The assembly function loops over each color and does a threaded assembly for that color
-function doassemble(K::SparseMatrixCSC, colors, grid::Grid, dh::DofHandler, C::SymmetricTensor{4, dim}, ip) where {dim}
+
+# This constructor will be called within each task to create an independent `ScratchData`
+# object. For `cell_cache`, `Ke`, and `fe` we simply call the constructors to allocate
+# independent objects. For `cellvalues` we use `copy` which Ferrite defines for this
+# purpose. Finally, for the assembler we call `start_assemble` to create a new assembler but
+# note that we set `fillzero = false` because we don't want to risk that a task that starts
+# a bit later will zero out data that another task has already assembled.
+function ScratchData(dh::DofHandler, K::SparseMatrixCSC, f::Vector, cellvalues::CellValues)
+    cell_cache = CellCache(dh)
+    n = ndofs_per_cell(dh)
+    Ke = zeros(n, n)
+    fe = zeros(n)
+    asm = start_assemble(K, f; fillzero = false)
+    return ScratchData(cell_cache, copy(cellvalues), Ke, fe, asm)
+end
+nothing # hide
-    f = zeros(ndofs(dh))
-    scratches = create_scratchvalues(K, f, dh, ip)
-    b = Vec{3}((0.0, 0.0, 0.0)) # Body force
+# ### Global assembly routine
+#
+# Finally we define the global assembly routine, which is where the parallelization happens.
+# The main difference from all previous `assemble_global!` functions is that we now have an
+# outer loop over the colors, and then an inner loop over the cells in each color, which
+# can be parallelized.
+#
+# For the scheduling of parallel tasks we use the
+# [OhMyThreads.jl](https://github.com/JuliaFolds2/OhMyThreads.jl) package. OhMyThreads
+# provides a macro-based and a functional API. Here we use the macro-based API because it is
+# slightly more convenient when using task local values since they can be defined with the
+# `@local` macro.
+#
+# !!! note "Schedulers and load balancing"
+#     OhMyThreads provides a number of different
+#     [schedulers](https://juliafolds2.github.io/OhMyThreads.jl/stable/refs/api/#Schedulers).
+#     In this example we use the `DynamicScheduler` (which is the default one). The
+#     `DynamicScheduler` will spawn `ntasks` tasks where each task will process a chunk of
+#     (roughly) equal number of cells (i.e. `length(color) ÷ ntasks`). This should be a good
+#     choice for this example because we expect all cells to take the same time to process
+#     and we don't need any load balancing.
+#
+#     For a different problem setup where some cells might take longer to process (perhaps
+#     they experience plastic deformation and we need to solve a local problem), we might
+#     benefit from load balancing. The `DynamicScheduler` can also be used for load
+#     balancing by specifying `nchunks` or `chunksize`. However, the `DynamicScheduler`
+#     will always spawn `nchunks` tasks, which can become costly since we are allocating
+#     scratch data for every task. To limit the number of tasks, while allowing for more
+#     than `ntasks` chunks, we can use the `GreedyScheduler` *with chunking*. For example,
+#     `scheduler = OhMyThreads.GreedyScheduler(; ntasks = ntasks, nchunks = 10 * ntasks)`
+#     will split the work into `10 * ntasks` chunks and spawn `ntasks` tasks to process
+#     them. Refer to the [OhMyThreads
+#     documentation](https://juliafolds2.github.io/OhMyThreads.jl/stable/) for details.
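+#
+# To make the note above concrete: if load balancing were needed, the scheduler set up
+# inside the color loop below could be swapped for the chunked `GreedyScheduler` suggested
+# in the note. This is only a sketch (the keyword values are the ones from the note, not
+# tuned numbers) and the loop body would be identical to the one in `assemble_global!`
+# below:
+#
+# ```julia
+# scheduler = OhMyThreads.GreedyScheduler(; ntasks = ntasks, nchunks = 10 * ntasks)
+# OhMyThreads.@tasks for cellidx in color
+#     @set scheduler = scheduler
+#     # ... same body as in `assemble_global!` below ...
+# end
+# ```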
+ +using OhMyThreads, TaskLocalValues + +function assemble_global!( + K::SparseMatrixCSC, f::Vector, dh::DofHandler, colors, + cellvalues_template::CellValues; ntasks = Threads.nthreads() + ) + ## Zero-out existing data in K and f + _ = start_assemble(K, f) + ## Body force and material stiffness + b = Vec{3}((0.0, 0.0, -1.0)) + C = create_material_stiffness() + ## Loop over the colors for color in colors - ## Each color is safe to assemble threaded - Threads.@threads :static for i in 1:length(color) - assemble_cell!(scratches[Threads.threadid()], color[i], K, grid, dh, C, b) + ## Dynamic scheduler spawning `ntasks` tasks where each task will process a chunk of + ## (roughly) equal number of cells (`length(color) ÷ ntasks`). + scheduler = OhMyThreads.DynamicScheduler(; ntasks) + ## Parallelize the loop over the cells in this color + OhMyThreads.@tasks for cellidx in color + ## Tell the @tasks loop to use the scheduler defined above + @set scheduler = scheduler + ## Obtain a task local scratch and unpack it + @local scratch = ScratchData(dh, K, f, cellvalues_template) + (; cell_cache, cellvalues, Ke, fe, assembler) = scratch + ## Reinitialize the cell cache and then the cellvalues + reinit!(cell_cache, cellidx) + reinit!(cellvalues, cell_cache) + ## Compute the local contribution of the cell + assemble_cell!(Ke, fe, cellvalues, C, b) + ## Assemble local contribution + assemble!(assembler, celldofs(cell_cache), Ke, fe) end end - return K, f end - -# The cell assembly function is written the same way as if it was a single threaded example. -# The only difference is that we unpack the variables from our `scratch`. -function assemble_cell!(scratch::ScratchValues, cell::Int, K::SparseMatrixCSC, - grid::Grid, dh::DofHandler, C::SymmetricTensor{4, dim}, b::Vec{dim}) where {dim} - - ## Unpack our stuff from the scratch - Ke, fe, cellvalues, facetvalues, global_dofs, ɛ, coordinates, assembler = - scratch.Ke, scratch.fe, scratch.cellvalues, scratch.facetvalues, - scratch.global_dofs, scratch.ɛ, scratch.coordinates, scratch.assembler - - fill!(Ke, 0) - fill!(fe, 0) - - n_basefuncs = getnbasefunctions(cellvalues) - - ## Fill up the coordinates - nodeids = grid.cells[cell].nodes - for j in 1:length(coordinates) - coordinates[j] = grid.nodes[nodeids[j]].x - end - - reinit!(cellvalues, coordinates) - - for q_point in 1:getnquadpoints(cellvalues) - for i in 1:n_basefuncs - ɛ[i] = symmetric(shape_gradient(cellvalues, q_point, i)) - end - dΩ = getdetJdV(cellvalues, q_point) - for i in 1:n_basefuncs - δu = shape_value(cellvalues, q_point, i) - fe[i] += (δu ⋅ b) * dΩ - ɛC = ɛ[i] ⊡ C - for j in 1:n_basefuncs - Ke[i, j] += (ɛC ⊡ ɛ[j]) * dΩ - end - end - end - - celldofs!(global_dofs, dh, cell) - assemble!(assembler, global_dofs, Ke, fe) -end; - -function run_assemble() - n = 20 - grid, colors = create_colored_cantilever_grid(Hexahedron, n); - ip = Lagrange{RefHexahedron,1}()^3 - dh = create_dofhandler(grid, ip); - - K = allocate_matrix(dh); - C = create_stiffness(Val{3}()); - ## compilation - doassemble(K, colors, grid, dh, C, ip); - b = @elapsed @time K, f = doassemble(K, colors, grid, dh, C, ip); - return b +nothing # hide + +# !!! details "OhMyThreads functional API: OhMyThreads.tforeach" +# The `OhMyThreads.@tasks` block above corresponds to a call to `OhMyThreads.tforeach`. +# Using the functional API directly would look like below. The main difference is that +# we need to manually create a `TaskLocalValue` for the scratch data. 
+# ```julia +# # using TaskLocalValues +# scratches = TaskLocalValue() do +# ScratchData(dh, K, f, cellvalues) +# end +# OhMyThreads.tforeach(color; scheduler) do cellidx +# # Obtain a task local scratch and unpack it +# scratch = scratches[] +# (; cell_cache, cellvalues, Ke, fe, assembler) = scratch +# # Reinitialize the cell cache and then the cellvalues +# reinit!(cell_cache, cellidx) +# reinit!(cellvalues, cell_cache) +# # Compute the local contribution of the cell +# assemble_cell!(Ke, fe, cellvalues, C, b) +# # Assemble local contribution +# assemble!(assembler, celldofs(cell_cache), Ke, fe) +# end +# ``` + +# We define the main function to setup everything and then time the call to +# `assemble_global!`. + +function main(; n = 20, ntasks = Threads.nthreads()) + ## Interpolation, quadrature and cellvalues + interpolation = Lagrange{RefHexahedron, 1}()^3 + quadrature = QuadratureRule{RefHexahedron}(2) + cellvalues = CellValues(quadrature, interpolation) + ## Grid, colors and DofHandler + grid, colors = create_cantilever_grid(n) + dh = create_dofhandler(grid, interpolation) + ## Global matrix and vector + K = allocate_matrix(dh) + f = zeros(ndofs(dh)) + ## Compile it + assemble_global!(K, f, dh, colors, cellvalues; ntasks = ntasks) + ## Time it + @time assemble_global!(K, f, dh, colors, cellvalues; ntasks = ntasks) + return norm(K.nzval), norm(f) #src + return end - -run_assemble() - -# Running the code with different number of threads give the following runtimes: -# * 1 thread 2.46 seconds -# * 2 threads 1.19 seconds -# * 3 threads 0.83 seconds -# * 4 threads 0.75 seconds +nothing # hide + +# On a machine with 4 cores, starting julia with `--threads=auto`, we obtain the following +# timings: +# ```julia +# main(; ntasks = 1) # 1.970784 seconds (902 allocations: 816.172 KiB) +# main(; ntasks = 2) # 1.025065 seconds (1.64 k allocations: 1.564 MiB) +# main(; ntasks = 3) # 0.700423 seconds (2.38 k allocations: 2.332 MiB) +# main(; ntasks = 4) # 0.548356 seconds (3.12 k allocations: 3.099 MiB) +# ``` + +using Test #src +nK1, nf1 = main(; n = 5, ntasks = 1) #src +nK2, nf2 = main(; n = 5, ntasks = 2) #src +nK4, nf4 = main(; n = 5, ntasks = 4) #src +@test nK1 == nK2 == nK4 #src +@test nf1 == nf2 == nf4 #src #md # ## [Plain program](@id threaded_assembly-plain-program) #md # diff --git a/test/test_examples.jl b/test/test_examples.jl index d20001d18f..dca1ed73cf 100644 --- a/test/test_examples.jl +++ b/test/test_examples.jl @@ -61,3 +61,14 @@ if !Sys.iswindows() end end end + +module TestMultiThreading + mktempdir() do dir + cd(dir) do + # Silence the @time output + redirect_stdout(devnull) do + include(joinpath(@__DIR__, "../docs/src/literate-howto/threaded_assembly.jl")) + end + end + end +end
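As a usage sketch (not part of the diff; the `--project=docs` flag, the grid size `n = 10`, and running from the repository root are assumptions), the how-to can be exercised with a chosen number of Julia threads, since `Threads.nthreads()` is the default value for `ntasks` in `main`:

```julia
# Start Julia with e.g. `julia --project=docs --threads=4`, then:
include(joinpath("docs", "src", "literate-howto", "threaded_assembly.jl"))
main(; n = 10)              # ntasks defaults to Threads.nthreads()
main(; n = 10, ntasks = 2)  # compare the timing with fewer tasks
```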