diff --git a/Project.toml b/Project.toml index bd8f0b7d29..52bb20f614 100644 --- a/Project.toml +++ b/Project.toml @@ -45,12 +45,14 @@ IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" NBInclude = "0db19996-df87-5ea3-a455-e3a50d440464" +OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce" +TaskLocalValues = "ed4db957-447d-4319-bfb6-7fa9ae7ecf34" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" [targets] -test = ["BlockArrays", "Downloads", "FerriteGmsh", "ForwardDiff", "Gmsh", "IterativeSolvers", "Metis", "Pkg", "NBInclude", "ProgressMeter", "Random", "SHA", "Test", "TimerOutputs", "Logging"] +test = ["BlockArrays", "Downloads", "FerriteGmsh", "ForwardDiff", "Gmsh", "IterativeSolvers", "Metis", "Pkg", "NBInclude", "OhMyThreads", "ProgressMeter", "Random", "SHA", "TaskLocalValues", "Test", "TimerOutputs", "Logging"] diff --git a/docs/Manifest.toml b/docs/Manifest.toml index 4b67ef2b1f..daad98ede8 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -2,7 +2,7 @@ julia_version = "1.10.5" manifest_format = "2.0" -project_hash = "864d1d29886fc101f4919a6d1c30efd8f4e2ca2e" +project_hash = "4e52f4aa4cee9f66ec4f633f0ae538fbd227ac5e" [[deps.ADTypes]] git-tree-sha1 = "5a5eafb8344b81b8c2237f8a6f6b3602b3f6180e" @@ -112,6 +112,28 @@ weakdeps = ["SparseArrays"] [[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" +[[deps.BangBang]] +deps = ["Accessors", "ConstructionBase", "InitialValues", "LinearAlgebra", "Requires"] +git-tree-sha1 = "e2144b631226d9eeab2d746ca8880b7ccff504ae" +uuid = "198e06fe-97b7-11e9-32a5-e1d131e6ad66" +version = "0.4.3" + + [deps.BangBang.extensions] + BangBangChainRulesCoreExt = "ChainRulesCore" + BangBangDataFramesExt = "DataFrames" + BangBangStaticArraysExt = "StaticArrays" + BangBangStructArraysExt = "StructArrays" + BangBangTablesExt = "Tables" + BangBangTypedTablesExt = "TypedTables" + + [deps.BangBang.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" + StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" + StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" + Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" + TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" + [[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" @@ -188,6 +210,12 @@ git-tree-sha1 = "e579c6157598169ad4ef17263bdf3452b4a3e316" uuid = "5217a498-cd5d-4ec6-b8c2-9b85a09b6e3e" version = "1.1.0" +[[deps.ChunkSplitters]] +deps = ["TestItems"] +git-tree-sha1 = "01d5db8756afc4022b1cf267cfede13245226c72" +uuid = "ae650224-84b6-46f8-82ea-d812ca08434e" +version = "2.6.0" + [[deps.CloseOpenIntervals]] deps = ["Static", "StaticArrayInterface"] git-tree-sha1 = "05ba0d07cd4fd8b7a39541e31a7b0254704ea581" @@ -796,6 +824,11 @@ git-tree-sha1 = "d1b1b796e47d94588b3757fe84fbf65a5ec4a80d" uuid = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" version = "0.1.5" +[[deps.InitialValues]] +git-tree-sha1 = "4da0f88e9a39111c2fa3add390ab15f3a44f3ca3" +uuid = "22cec73e-a1b8-11e9-2c92-598750a2cf9c" +version = "0.3.1" + [[deps.IntelOpenMP_jll]] deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl"] git-tree-sha1 = "10bd689145d2c3b2a9844005d01087cc1194e79e" @@ -1338,6 +1371,12 @@ git-tree-sha1 = 
"887579a3eb005446d514ab7aeac5d1d027658b8f" uuid = "e7412a2a-1a6e-54c0-be00-318e2571c051" version = "1.3.5+1" +[[deps.OhMyThreads]] +deps = ["BangBang", "ChunkSplitters", "StableTasks", "TaskLocalValues"] +git-tree-sha1 = "6acbf70d8306a38a870be56d1841b2c9a4b17837" +uuid = "67456a42-1dca-4109-a031-0a68de7e3ad5" +version = "0.6.2" + [[deps.OpenBLAS_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" @@ -2003,6 +2042,11 @@ weakdeps = ["ChainRulesCore"] [deps.SpecialFunctions.extensions] SpecialFunctionsChainRulesCoreExt = "ChainRulesCore" +[[deps.StableTasks]] +git-tree-sha1 = "073d5c20d44129b20fe954720b97069579fa403b" +uuid = "91464d47-22a1-43fe-8b7f-2d57ee82463f" +version = "0.1.5" + [[deps.Static]] deps = ["CommonWorldInvalidations", "IfElse", "PrecompileTools"] git-tree-sha1 = "87d51a3ee9a4b0d2fe054bdd3fc2436258db2603" @@ -2104,6 +2148,11 @@ deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" version = "1.10.0" +[[deps.TaskLocalValues]] +git-tree-sha1 = "d155450e6dff2a8bc2fcb81dcb194bd98b0aeb46" +uuid = "ed4db957-447d-4319-bfb6-7fa9ae7ecf34" +version = "0.1.2" + [[deps.TensorCore]] deps = ["LinearAlgebra"] git-tree-sha1 = "1feb45f88d133a655e001435632f019a9a1bcdb6" @@ -2120,6 +2169,11 @@ version = "1.16.1" deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +[[deps.TestItems]] +git-tree-sha1 = "42fd9023fef18b9b78c8343a4e2f3813ffbcefcb" +uuid = "1c621080-faea-4a02-84b6-bbd5e436b8fe" +version = "1.0.0" + [[deps.ThreadingUtilities]] deps = ["ManualMemory"] git-tree-sha1 = "eda08f7e9818eb53661b3deb74e3159460dfbc27" diff --git a/docs/Project.toml b/docs/Project.toml index 944be43ceb..ead8045637 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -13,11 +13,13 @@ LineSearches = "d3d80556-e9d4-5f37-9878-2ab0fcc64255" LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae" Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589" +OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5" Optim = "429524aa-4258-5aef-a3af-852621145aeb" OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" +TaskLocalValues = "ed4db957-447d-4319-bfb6-7fa9ae7ecf34" Tensors = "48a634ad-e948-5137-8d70-aa71f2a747f4" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" diff --git a/docs/src/literate-howto/threaded_assembly.jl b/docs/src/literate-howto/threaded_assembly.jl index fa0526fa9f..8f47cccc75 100644 --- a/docs/src/literate-howto/threaded_assembly.jl +++ b/docs/src/literate-howto/threaded_assembly.jl @@ -1,3 +1,6 @@ +# ```@meta +# Draft = false +# ``` # # [Multi-threaded assembly](@id tutorial-threaded-assembly) # #- @@ -5,198 +8,310 @@ #md # This example is also available as a Jupyter notebook: #md # [`threaded_assembly.ipynb`](@__NBVIEWER_ROOT_URL__/howto/threaded_assembly.ipynb). #- + +# ## Introduction +# +# In this howto we will explore how to use task based multithreading (shared memory +# parallelism) to speed up the analysis. Some parts of a finite element simulation are +# trivially parallelizable such as the computation of the local element contributions since +# each element can be processed independently. 
However, two things need to be considered in
+# order to parallelize safely:
#
-# ## Example of a colored grid
+# - **Modification of shared data**: Although the contributions from all the elements can
+#   be computed independently, eventually they need to be assembled into the global
+#   matrix and vector. Letting each task assemble its own contribution would lead to
+#   race conditions since elements share degrees of freedom with each other. There are
+#   various ways to remedy this, for example:
+#   - **Locking**: By using a lock around the call to `assemble!` we can ensure that only
+#     one task assembles at a time (see the sketch after this list). This is simple to
+#     implement but can lead to lock contention and thus poor performance. Another
+#     drawback is that the results will not be deterministic since floating point
+#     operations are neither associative nor commutative.
+#   - **Assembler task**: By using a designated task for the assembling we (obviously)
+#     ensure that only a single task assembles. The worker tasks (the tasks computing the
+#     element contributions) would then hand off their results to the assembly task. This
+#     can be a useful approach if computing the element contributions is much slower than
+#     the assembly -- otherwise the assembler task can't keep up with the worker tasks.
+#     There might also be some extra overhead because of task switching in the scheduler.
+#     The problem with non-deterministic results still remains.
+#   - **Grid coloring**: By "coloring" the grid such that, within each color, no two
+#     elements share degrees of freedom, we can safely assemble each color in parallel.
+#     Even if concurrently running tasks write to the global matrix and vector, they
+#     will not write to the same memory locations. Note also that this procedure gives
+#     predictable results: for a memory location to which, for example, a "red", a
+#     "blue", and a "green" element contribute, we always add the red contribution
+#     first, then the blue, and finally the green.
+# - **Scratch data**: In order to speed up the computation of the element contributions we
+#   typically pre-allocate some data structures that can be reused for every element. Such
+#   scratch data include, for example, the local matrix and vector, and the CellValues.
+#   Each task needs its own copy of the scratch data since it will be modified for each
+#   element.
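+#
+# To make the locking alternative above concrete, a minimal sketch is shown below. It is
+# not used in the rest of this how-to and simply reuses the names that are set up later
+# (`grid`, `dh`, `cellvalues`, `K`, `f`, `C`, `b`, and the element routine
+# `assemble_cell!`); for brevity it also allocates the scratch buffers per cell instead of
+# per task.
+#
+# ```julia
+# assembler = start_assemble(K, f)
+# lck = ReentrantLock()
+# Threads.@threads for cellidx in 1:getncells(grid)
+#     # Scratch buffers (allocated per cell here to keep the sketch short; in practice
+#     # they should be task local as discussed above)
+#     cell_cache = CellCache(dh)
+#     cv = copy(cellvalues)
+#     n = ndofs_per_cell(dh)
+#     Ke = zeros(n, n)
+#     fe = zeros(n)
+#     # Compute the local contribution of this cell
+#     reinit!(cell_cache, cellidx)
+#     reinit!(cv, cell_cache)
+#     assemble_cell!(Ke, fe, cv, C, b)
+#     # The lock ensures that only one task at a time writes to K and f
+#     lock(lck) do
+#         assemble!(assembler, celldofs(cell_cache), Ke, fe)
+#     end
+# end
+# ```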
For this structured grid the +# greedy algorithm uses fewer colors, but both algorithms result in colors that contain +# roughly the same number of elements. The workstream algorithm is the default one since it +# in general results in more balanced colors. For unstructured grids the greedy algorithm +# can result in colors with very few elements, for example. using Ferrite, SparseArrays function create_example_2d_grid() grid = generate_grid(Quadrilateral, (10, 10), Vec{2}((0.0, 0.0)), Vec{2}((10.0, 10.0))) - colors_workstream = create_coloring(grid; alg=ColoringAlgorithm.WorkStream) - colors_greedy = create_coloring(grid; alg=ColoringAlgorithm.Greedy) + colors_workstream = create_coloring(grid; alg = ColoringAlgorithm.WorkStream) + colors_greedy = create_coloring(grid; alg = ColoringAlgorithm.Greedy) VTKGridFile("colored", grid) do vtk Ferrite.write_cell_colors(vtk, grid, colors_workstream, "workstream-coloring") Ferrite.write_cell_colors(vtk, grid, colors_greedy, "greedy-coloring") end end -create_example_2d_grid(); +create_example_2d_grid() # ![](coloring.png) # # *Figure 1*: Element coloring using the "workstream"-algorithm (left) and the "greedy"- # algorithm (right). -# ## Cantilever beam in 3D with threaded assembly -# We will now look at an example where we assemble the stiffness matrix using multiple -# threads. We set up a simple grid and create a coloring, then create a DofHandler, -# and define the material stiffness +# ## Multithreaded assembly of a cantilever beam in 3D +# +# We will now look at an example where we assemble the stiffness matrix and right hand side +# using multiple threads. The problem setup is a cantilever beam in 3D with a linear elastic +# material behavior. For this exercise we only focus on the multithreading and are not +# bothered with boundary conditions. For more details refer to the [tutorial on linear +# elasticity](../tutorials/linear_elasticity.md). + +# ### Setup +# +# We define the element routine, material stiffness, grid and DofHandler just like in the +# [tutorial on linear elasticity](../tutorials/linear_elasticity.md) without discussing it +# further here. + +## Element routine +function assemble_cell!(Ke::Matrix, fe::Vector, cellvalues::CellValues, C::SymmetricTensor, b::Vec) + fill!(Ke, 0) + fill!(fe, 0) + for q_point in 1:getnquadpoints(cellvalues) + dΩ = getdetJdV(cellvalues, q_point) + for i in 1:getnbasefunctions(cellvalues) + δui = shape_value(cellvalues, q_point, i) + fe[i] += (δui ⋅ b) * dΩ + ∇δui = shape_symmetric_gradient(cellvalues, q_point, i) + for j in 1:getnbasefunctions(cellvalues) + ∇uj = shape_symmetric_gradient(cellvalues, q_point, j) + Ke[i, j] += (∇δui ⊡ C ⊡ ∇uj) * dΩ + end + end + end + return Ke, fe +end -# #### Grid for the beam -function create_colored_cantilever_grid(celltype, n) - grid = generate_grid(celltype, (10*n, n, n), Vec{3}((0.0, 0.0, 0.0)), Vec{3}((10.0, 1.0, 1.0))) +## Material stiffness +function create_material_stiffness() + E = 200.0e9 + ν = 0.3 + λ = E * ν / ((1 + ν) * (1 - 2ν)) + μ = E / (2(1 + ν)) + δ(i, j) = i == j ? 
+    C = SymmetricTensor{4, 3}() do i, j, k, l
+        return λ * δ(i, j) * δ(k, l) + μ * (δ(i, k) * δ(j, l) + δ(i, l) * δ(j, k))
+    end
+    return C
+end
+
+## Grid and grid coloring
+function create_cantilever_grid(n::Int)
+    xmin = Vec{3}((0.0, 0.0, 0.0))
+    xmax = Vec{3}((10.0, 1.0, 1.0))
+    grid = generate_grid(Hexahedron, (10 * n, n, n), xmin, xmax)
    colors = create_coloring(grid)
    return grid, colors
-end;
+end

-# #### DofHandler
-function create_dofhandler(grid::Grid{dim}, ip) where {dim}
+## DofHandler with displacement field u
+function create_dofhandler(grid::Grid, interpolation::VectorInterpolation)
    dh = DofHandler(grid)
-    add!(dh, :u, ip) # Add a displacement field
+    add!(dh, :u, interpolation)
    close!(dh)
-end;
-
-# ### Stiffness tensor for linear elasticity
-function create_stiffness(::Val{dim}) where {dim}
-    E = 200e9
-    ν = 0.3
-    λ = E*ν / ((1+ν) * (1 - 2ν))
-    μ = E / (2(1+ν))
-    δ(i,j) = i == j ? 1.0 : 0.0
-    g(i,j,k,l) = λ*δ(i,j)*δ(k,l) + μ*(δ(i,k)*δ(j,l) + δ(i,l)*δ(j,k))
-    C = SymmetricTensor{4, dim}(g);
-    return C
-end;
+    return dh
+end
+nothing # hide

-# ## Threaded data structures
+# ### Task local scratch data
#
-# ScratchValues is a thread-local collection of data that each thread needs to own,
-# since we need to be able to mutate the data in the threads independently
-struct ScratchValues{T, CV <: CellValues, FV <: FacetValues, TT <: AbstractTensor, dim, AT}
+# We group everything that needs to be duplicated for each task in the struct
+# `ScratchData`:
+# - `cell_cache::CellCache`: contains buffers for coordinates and (global) dofs which will
+#   be `reinit!`ed for each cell.
+# - `cellvalues::CellValues`: the cell values which will be `reinit!`ed for each cell using
+#   the `cell_cache`.
+# - `Ke::Matrix`: the local matrix
+# - `fe::Vector`: the local vector
+# - `assembler`: the assembler (which needs to be duplicated because it contains buffers
+#   that are modified during the call to `assemble!`)
+struct ScratchData{CC, CV, T, A}
+    cell_cache::CC
+    cellvalues::CV
    Ke::Matrix{T}
    fe::Vector{T}
-    cellvalues::CV
-    facetvalues::FV
-    global_dofs::Vector{Int}
-    ɛ::Vector{TT}
-    coordinates::Vector{Vec{dim, T}}
-    assembler::AT
-end;
-
-# Each thread need its own CellValues and FacetValues (although, for this example we don't use
-# the FacetValues)
-function create_values(interpolation_space::Interpolation{refshape}, qr_order::Int) where {dim, refshape<:Ferrite.AbstractRefShape{dim}}
-    ## Interpolations and values
-    quadrature_rule = QuadratureRule{refshape}(qr_order)
-    facet_quadrature_rule = FacetQuadratureRule{refshape}(qr_order)
-    cellvalues = [CellValues(quadrature_rule, interpolation_space) for i in 1:Threads.nthreads()];
-    facetvalues = [FacetValues(facet_quadrature_rule, interpolation_space) for i in 1:Threads.nthreads()];
-    return cellvalues, facetvalues
-end;
-
-# Create a `ScratchValues` for each thread with the thread local data
-function create_scratchvalues(K, f, dh::DofHandler{dim}, ip) where {dim}
-    nthreads = Threads.nthreads()
-    assemblers = [start_assemble(K, f) for i in 1:nthreads]
-    cellvalues, facetvalues = create_values(ip, 2)
-
-    n_basefuncs = getnbasefunctions(cellvalues[1])
-    global_dofs = [zeros(Int, ndofs_per_cell(dh)) for i in 1:nthreads]
-
-    fes = [zeros(n_basefuncs) for i in 1:nthreads] # Local force vector
-    Kes = [zeros(n_basefuncs, n_basefuncs) for i in 1:nthreads]
-
-    ɛs = [[zero(SymmetricTensor{2, dim}) for i in 1:n_basefuncs] for i in 1:nthreads]
-
-    coordinates = [[zero(Vec{dim}) for i in 1:length(dh.grid.cells[1].nodes)] for i in 1:nthreads]
-
-    return [ScratchValues(Kes[i], fes[i], cellvalues[i], facetvalues[i], global_dofs[i],
-                          ɛs[i], coordinates[i], assemblers[i]) for i in 1:nthreads]
-end;
-
-# ## Threaded assemble
+    assembler::A
end
-# The assembly function loops over each color and does a threaded assembly for that color
-function doassemble(K::SparseMatrixCSC, colors, grid::Grid, dh::DofHandler, C::SymmetricTensor{4, dim}, ip) where {dim}
+
+# This constructor will be called within each task to create an independent `ScratchData`
+# object. For `cell_cache`, `Ke`, and `fe` we simply call the constructors to allocate
+# independent objects. For `cellvalues` we use `copy` which Ferrite defines for this
+# purpose. Finally, for the assembler we call `start_assemble` to create a new assembler but
+# note that we set `fillzero = false` because we don't want to risk that a task that starts
+# a bit later will zero out data that another task has already assembled.
+function ScratchData(dh::DofHandler, K::SparseMatrixCSC, f::Vector, cellvalues::CellValues)
+    cell_cache = CellCache(dh)
+    n = ndofs_per_cell(dh)
+    Ke = zeros(n, n)
+    fe = zeros(n)
+    asm = start_assemble(K, f; fillzero = false)
+    return ScratchData(cell_cache, copy(cellvalues), Ke, fe, asm)
+end
+nothing # hide
-    f = zeros(ndofs(dh))
-    scratches = create_scratchvalues(K, f, dh, ip)
-    b = Vec{3}((0.0, 0.0, 0.0)) # Body force
+# ### Global assembly routine
+#
+# Finally we define the global assembly routine, which is where the parallelization happens.
+# The main difference from all previous `assemble_global!` functions is that we now have an
+# outer loop over the colors, and then an inner loop over the cells in each color, which
+# can be parallelized.
+#
+# For the scheduling of parallel tasks we use the
+# [OhMyThreads.jl](https://github.com/JuliaFolds2/OhMyThreads.jl) package. OhMyThreads
+# provides a macro-based and a functional API. Here we use the macro-based API because it is
+# slightly more convenient when using task local values since they can be defined with the
+# `@local` macro.
+#
+# !!! note "Schedulers and load balancing"
+#     OhMyThreads provides a number of different
+#     [schedulers](https://juliafolds2.github.io/OhMyThreads.jl/stable/refs/api/#Schedulers).
+#     In this example we use the `DynamicScheduler` (which is the default one). The
+#     `DynamicScheduler` will spawn `ntasks` tasks where each task will process a chunk of
+#     (roughly) equal number of cells (i.e. `length(color) ÷ ntasks`). This should be a good
+#     choice for this example because we expect all cells to take the same time to process
+#     and we don't need any load balancing.
+#
+#     For a different problem setup where some cells might take longer to process (perhaps
+#     they experience plastic deformation and we need to solve a local problem), we might
+#     benefit from load balancing. The `DynamicScheduler` can also be used for load
+#     balancing by specifying `nchunks` or `chunksize`. However, the `DynamicScheduler`
+#     will always spawn `nchunks` tasks, which can become costly since we are allocating
+#     scratch data for every task. To limit the number of tasks, while allowing for more
+#     than `ntasks` chunks, we can use the `GreedyScheduler` *with chunking*. For example,
+#     `scheduler = OhMyThreads.GreedyScheduler(; ntasks = ntasks, nchunks = 10 * ntasks)`
+#     will split the work into `10 * ntasks` chunks and spawn `ntasks` tasks to process
+#     them. Refer to the [OhMyThreads
+#     documentation](https://juliafolds2.github.io/OhMyThreads.jl/stable/) for details.
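+#
+# To make the note above concrete: if load balancing were needed, the scheduler set up
+# inside the color loop below could be swapped for the chunked `GreedyScheduler` suggested
+# in the note. This is only a sketch (the keyword values are the ones from the note, not
+# tuned numbers) and the loop body would be identical to the one in `assemble_global!`
+# below:
+#
+# ```julia
+# scheduler = OhMyThreads.GreedyScheduler(; ntasks = ntasks, nchunks = 10 * ntasks)
+# OhMyThreads.@tasks for cellidx in color
+#     @set scheduler = scheduler
+#     # ... same body as in `assemble_global!` below ...
+# end
+# ```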
+ +using OhMyThreads, TaskLocalValues + +function assemble_global!( + K::SparseMatrixCSC, f::Vector, dh::DofHandler, colors, + cellvalues_template::CellValues; ntasks = Threads.nthreads() + ) + ## Zero-out existing data in K and f + _ = start_assemble(K, f) + ## Body force and material stiffness + b = Vec{3}((0.0, 0.0, -1.0)) + C = create_material_stiffness() + ## Loop over the colors for color in colors - ## Each color is safe to assemble threaded - Threads.@threads :static for i in 1:length(color) - assemble_cell!(scratches[Threads.threadid()], color[i], K, grid, dh, C, b) + ## Dynamic scheduler spawning `ntasks` tasks where each task will process a chunk of + ## (roughly) equal number of cells (`length(color) ÷ ntasks`). + scheduler = OhMyThreads.DynamicScheduler(; ntasks) + ## Parallelize the loop over the cells in this color + OhMyThreads.@tasks for cellidx in color + ## Tell the @tasks loop to use the scheduler defined above + @set scheduler = scheduler + ## Obtain a task local scratch and unpack it + @local scratch = ScratchData(dh, K, f, cellvalues_template) + (; cell_cache, cellvalues, Ke, fe, assembler) = scratch + ## Reinitialize the cell cache and then the cellvalues + reinit!(cell_cache, cellidx) + reinit!(cellvalues, cell_cache) + ## Compute the local contribution of the cell + assemble_cell!(Ke, fe, cellvalues, C, b) + ## Assemble local contribution + assemble!(assembler, celldofs(cell_cache), Ke, fe) end end - return K, f end - -# The cell assembly function is written the same way as if it was a single threaded example. -# The only difference is that we unpack the variables from our `scratch`. -function assemble_cell!(scratch::ScratchValues, cell::Int, K::SparseMatrixCSC, - grid::Grid, dh::DofHandler, C::SymmetricTensor{4, dim}, b::Vec{dim}) where {dim} - - ## Unpack our stuff from the scratch - Ke, fe, cellvalues, facetvalues, global_dofs, ɛ, coordinates, assembler = - scratch.Ke, scratch.fe, scratch.cellvalues, scratch.facetvalues, - scratch.global_dofs, scratch.ɛ, scratch.coordinates, scratch.assembler - - fill!(Ke, 0) - fill!(fe, 0) - - n_basefuncs = getnbasefunctions(cellvalues) - - ## Fill up the coordinates - nodeids = grid.cells[cell].nodes - for j in 1:length(coordinates) - coordinates[j] = grid.nodes[nodeids[j]].x - end - - reinit!(cellvalues, coordinates) - - for q_point in 1:getnquadpoints(cellvalues) - for i in 1:n_basefuncs - ɛ[i] = symmetric(shape_gradient(cellvalues, q_point, i)) - end - dΩ = getdetJdV(cellvalues, q_point) - for i in 1:n_basefuncs - δu = shape_value(cellvalues, q_point, i) - fe[i] += (δu ⋅ b) * dΩ - ɛC = ɛ[i] ⊡ C - for j in 1:n_basefuncs - Ke[i, j] += (ɛC ⊡ ɛ[j]) * dΩ - end - end - end - - celldofs!(global_dofs, dh, cell) - assemble!(assembler, global_dofs, Ke, fe) -end; - -function run_assemble() - n = 20 - grid, colors = create_colored_cantilever_grid(Hexahedron, n); - ip = Lagrange{RefHexahedron,1}()^3 - dh = create_dofhandler(grid, ip); - - K = allocate_matrix(dh); - C = create_stiffness(Val{3}()); - ## compilation - doassemble(K, colors, grid, dh, C, ip); - b = @elapsed @time K, f = doassemble(K, colors, grid, dh, C, ip); - return b +nothing # hide + +# !!! details "OhMyThreads functional API: OhMyThreads.tforeach" +# The `OhMyThreads.@tasks` block above corresponds to a call to `OhMyThreads.tforeach`. +# Using the functional API directly would look like below. The main difference is that +# we need to manually create a `TaskLocalValue` for the scratch data. 
+# ```julia +# # using TaskLocalValues +# scratches = TaskLocalValue() do +# ScratchData(dh, K, f, cellvalues) +# end +# OhMyThreads.tforeach(color; scheduler) do cellidx +# # Obtain a task local scratch and unpack it +# scratch = scratches[] +# (; cell_cache, cellvalues, Ke, fe, assembler) = scratch +# # Reinitialize the cell cache and then the cellvalues +# reinit!(cell_cache, cellidx) +# reinit!(cellvalues, cell_cache) +# # Compute the local contribution of the cell +# assemble_cell!(Ke, fe, cellvalues, C, b) +# # Assemble local contribution +# assemble!(assembler, celldofs(cell_cache), Ke, fe) +# end +# ``` + +# We define the main function to setup everything and then time the call to +# `assemble_global!`. + +function main(; n = 20, ntasks = Threads.nthreads()) + ## Interpolation, quadrature and cellvalues + interpolation = Lagrange{RefHexahedron, 1}()^3 + quadrature = QuadratureRule{RefHexahedron}(2) + cellvalues = CellValues(quadrature, interpolation) + ## Grid, colors and DofHandler + grid, colors = create_cantilever_grid(n) + dh = create_dofhandler(grid, interpolation) + ## Global matrix and vector + K = allocate_matrix(dh) + f = zeros(ndofs(dh)) + ## Compile it + assemble_global!(K, f, dh, colors, cellvalues; ntasks = ntasks) + ## Time it + @time assemble_global!(K, f, dh, colors, cellvalues; ntasks = ntasks) + return norm(K.nzval), norm(f) #src + return end - -run_assemble() - -# Running the code with different number of threads give the following runtimes: -# * 1 thread 2.46 seconds -# * 2 threads 1.19 seconds -# * 3 threads 0.83 seconds -# * 4 threads 0.75 seconds +nothing # hide + +# On a machine with 4 cores, starting julia with `--threads=auto`, we obtain the following +# timings: +# ```julia +# main(; ntasks = 1) # 1.970784 seconds (902 allocations: 816.172 KiB) +# main(; ntasks = 2) # 1.025065 seconds (1.64 k allocations: 1.564 MiB) +# main(; ntasks = 3) # 0.700423 seconds (2.38 k allocations: 2.332 MiB) +# main(; ntasks = 4) # 0.548356 seconds (3.12 k allocations: 3.099 MiB) +# ``` + +using Test #src +nK1, nf1 = main(; n = 5, ntasks = 1) #src +nK2, nf2 = main(; n = 5, ntasks = 2) #src +nK4, nf4 = main(; n = 5, ntasks = 4) #src +@test nK1 == nK2 == nK4 #src +@test nf1 == nf2 == nf4 #src #md # ## [Plain program](@id threaded_assembly-plain-program) #md # diff --git a/test/test_examples.jl b/test/test_examples.jl index d20001d18f..dca1ed73cf 100644 --- a/test/test_examples.jl +++ b/test/test_examples.jl @@ -61,3 +61,14 @@ if !Sys.iswindows() end end end + +module TestMultiThreading + mktempdir() do dir + cd(dir) do + # Silence the @time output + redirect_stdout(devnull) do + include(joinpath(@__DIR__, "../docs/src/literate-howto/threaded_assembly.jl")) + end + end + end +end
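As a usage sketch (not part of the diff; the `--project=docs` flag, the grid size `n = 10`, and running from the repository root are assumptions), the how-to can be exercised with a chosen number of Julia threads, since `Threads.nthreads()` is the default value for `ntasks` in `main`:

```julia
# Start Julia with e.g. `julia --project=docs --threads=4`, then:
include(joinpath("docs", "src", "literate-howto", "threaded_assembly.jl"))
main(; n = 10)              # ntasks defaults to Threads.nthreads()
main(; n = 10, ntasks = 2)  # compare the timing with fewer tasks
```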