From 0d86738be070ac5e0b7ec1d9d40dbf3f37165619 Mon Sep 17 00:00:00 2001
From: Daniel Seemaier
Date: Wed, 30 Oct 2024 13:43:49 +0100
Subject: [PATCH 1/6] refactor(dist): remove OpenMP dependency in sparse
 alltoall

---
 kaminpar-dist/graphutils/communication.h | 312 ++++++++++++-----------
 1 file changed, 158 insertions(+), 154 deletions(-)

diff --git a/kaminpar-dist/graphutils/communication.h b/kaminpar-dist/graphutils/communication.h
index 7e78e4b8..445fbf30 100644
--- a/kaminpar-dist/graphutils/communication.h
+++ b/kaminpar-dist/graphutils/communication.h
@@ -9,7 +9,7 @@
 
 #include
 
-#include
+#include
 
 #include "kaminpar-mpi/sparse_alltoall.h"
 #include "kaminpar-mpi/utils.h"
@@ -21,6 +21,7 @@
 #include "kaminpar-common/datastructures/marker.h"
 #include "kaminpar-common/datastructures/noinit_vector.h"
 #include "kaminpar-common/logger.h"
+#include "kaminpar-common/parallel/loops.h"
 
 #define SPARSE_ALLTOALL_NOFILTER \
   [](NodeID) { \
   }
 
 namespace kaminpar::mpi::graph {
+
 using namespace kaminpar::dist;
 
 SET_DEBUG(false);
 
 namespace internal {
+
 template void inclusive_col_prefix_sum(Data &data) {
   if (data.empty()) {
     return;
   }
@@ -46,6 +49,7 @@ template void inclusive_col_prefix_sum(Data &data) {
     }
   }
 }
+
 } // namespace internal
 
 /**
@@ -144,40 +148,41 @@ void sparse_alltoall_interface_to_ghost_custom_range(
   const auto [size, rank] = mpi::get_comm_info(graph.communicator());
 
-  // START_TIMER("Message construction");
-
   // Allocate message counters
-  const PEID num_threads = omp_get_max_threads();
+  const PEID num_threads = tbb::this_task_arena::max_concurrency();
   std::vector> num_messages(
       num_threads, CacheAlignedVector(size)
   );
 
   // Count messages to each PE for each thread
-#pragma omp parallel for default(none) shared(graph, from, to, mapper, num_messages, filter)
-  for (NodeID seq_u = from; seq_u < to; ++seq_u) {
-    const NodeID u = mapper(seq_u);
-
-    if constexpr (filter_invocable_with_node) {
-      if (!filter(u)) {
-        continue;
-      }
-    }
-
-    const PEID thread = omp_get_thread_num();
-
-    graph.neighbors(u, [&](const EdgeID e, const NodeID v, const EdgeWeight w) {
-      if (graph.is_ghost_node(v)) {
-        if constexpr (filter_invocable_with_edge) {
-          if (!filter(u, e, v, w)) {
-            return;
-          }
-        }
-
-        const PEID owner = graph.ghost_owner(v);
-        ++num_messages[thread][owner];
-      }
-    });
-  }
+  parallel::deterministic_for(
+      from,
+      to,
+      [&](const NodeID range_from, const NodeID range_to, const PEID thread) {
+        for (NodeID seq_u = range_from; seq_u < range_to; ++seq_u) {
+          const NodeID u = mapper(seq_u);
+
+          if constexpr (filter_invocable_with_node) {
+            if (!filter(u)) {
+              continue;
+            }
+          }
+
+          graph.neighbors(u, [&](const EdgeID e, const NodeID v, const EdgeWeight w) {
+            if (graph.is_ghost_node(v)) {
+              if constexpr (filter_invocable_with_edge) {
+                if (!filter(u, e, v, w)) {
+                  return;
+                }
+              }
+
+              const PEID owner = graph.ghost_owner(v);
+              ++num_messages[thread][owner];
+            }
+          });
+        }
+      }
+  );
 
   // Offset messages for each thread
   internal::inclusive_col_prefix_sum(num_messages);
@@ -188,38 +193,39 @@ void sparse_alltoall_interface_to_ghost_custom_range(
     send_buffers[pe].resize(num_messages.back()[pe]);
   });
 
-#pragma omp parallel for default(none) \
-    shared(send_buffers, from, to, mapper, filter, graph, builder, num_messages)
-  for (NodeID seq_u = from; seq_u < to; ++seq_u) {
-    const NodeID u = mapper(seq_u);
-
-    if constexpr (filter_invocable_with_node) {
-      if (!filter(u)) {
-        continue;
-      }
-    }
-
-    const PEID thread = omp_get_thread_num();
-    graph.neighbors(u, [&](const EdgeID e, const NodeID v, const EdgeWeight w) {
-      if (graph.is_ghost_node(v)) {
-        if constexpr (filter_invocable_with_edge) {
-          if (!filter(u, e, v, w)) {
-            return;
-          }
-        }
-
-        const PEID pe = graph.ghost_owner(v);
-        const std::size_t slot = --num_messages[thread][pe];
-        if constexpr (builder_invocable_with_pe) {
-          send_buffers[pe][slot] = builder(u, e, v, w, pe);
-        } else /* if (builder_invocable_without_pe) */ {
-          send_buffers[pe][slot] = builder(u, e, v, w);
-        }
-      }
-    });
-  }
-
-  // STOP_TIMER();
+  parallel::deterministic_for(
+      from,
+      to,
+      [&](const NodeID range_from, const NodeID range_to, const PEID thread) {
+        for (NodeID seq_u = range_from; seq_u < range_to; ++seq_u) {
+          const NodeID u = mapper(seq_u);
+
+          if constexpr (filter_invocable_with_node) {
+            if (!filter(u)) {
+              continue;
+            }
+          }
+
+          graph.neighbors(u, [&](const EdgeID e, const NodeID v, const EdgeWeight w) {
+            if (graph.is_ghost_node(v)) {
+              if constexpr (filter_invocable_with_edge) {
+                if (!filter(u, e, v, w)) {
+                  return;
+                }
+              }
+
+              const PEID pe = graph.ghost_owner(v);
+              const std::size_t slot = --num_messages[thread][pe];
+              if constexpr (builder_invocable_with_pe) {
+                send_buffers[pe][slot] = builder(u, e, v, w, pe);
+              } else /* if (builder_invocable_without_pe) */ {
+                send_buffers[pe][slot] = builder(u, e, v, w);
+              }
+            }
+          });
+        }
+      }
+  );
 
   sparse_alltoall(
       std::move(send_buffers), std::forward(receiver), graph.communicator()
@@ -470,48 +476,49 @@ void sparse_alltoall_interface_to_pe_custom_range(
 
   // START_TIMER("Message construction");
 
   // Allocate message counters
-  const PEID num_threads = omp_get_max_threads();
+  const PEID num_threads = tbb::this_task_arena::max_concurrency();
   std::vector> num_messages(
       num_threads, CacheAlignedVector(size)
   );
 
-#pragma omp parallel default(none) shared(size, from, to, mapper, filter, graph, num_messages)
-  {
-    Marker<> created_message_for_pe(static_cast(size));
-    const PEID thread = omp_get_thread_num();
-
-#pragma omp for
-    for (NodeID seq_u = from; seq_u < to; ++seq_u) {
-      const NodeID u = mapper(seq_u);
-
-      if constexpr (filter_invocable_with_unmapped_node) {
-        if (!filter(seq_u, u)) {
-          continue;
-        }
-      } else {
-        if (!filter(u)) {
-          continue;
-        }
-      }
-
-      graph.adjacent_nodes(u, [&](const NodeID v) {
-        if (!graph.is_ghost_node(v)) {
-          return;
-        }
-
-        const PEID pe = graph.ghost_owner(v);
-
-        if (created_message_for_pe.get(pe)) {
-          return;
-        }
-        created_message_for_pe.set(pe);
-
-        ++num_messages[thread][pe];
-      });
-
-      created_message_for_pe.reset();
-    }
-  }
+  parallel::deterministic_for(
+      from,
+      to,
+      [&](const NodeID range_from, const NodeID range_to, const PEID thread) {
+        Marker<> created_message_for_pe(static_cast(size));
+
+        for (NodeID seq_u = range_from; seq_u < range_to; ++seq_u) {
+          const NodeID u = mapper(seq_u);
+
+          if constexpr (filter_invocable_with_unmapped_node) {
+            if (!filter(seq_u, u)) {
+              continue;
+            }
+          } else {
+            if (!filter(u)) {
+              continue;
+            }
+          }
+
+          graph.adjacent_nodes(u, [&](const NodeID v) {
+            if (!graph.is_ghost_node(v)) {
+              return;
+            }
+
+            const PEID pe = graph.ghost_owner(v);
+
+            if (created_message_for_pe.get(pe)) {
+              return;
+            }
+            created_message_for_pe.set(pe);
+
+            ++num_messages[thread][pe];
+          });
+
+          created_message_for_pe.reset();
+        }
+      }
+  );
 
   // Offset messages for each thread
   internal::inclusive_col_prefix_sum(num_messages);
@@ -522,55 +529,52 @@ void sparse_alltoall_interface_to_pe_custom_range(
     send_buffers[pe].resize(num_messages.back()[pe]);
   });
 
-  // Fill buffers
-#pragma omp parallel default(none) \
-    shared(send_buffers, size, from, to, mapper, builder, filter, graph, num_messages)
-  {
-    Marker<> created_message_for_pe(static_cast(size));
-    const PEID thread = omp_get_thread_num();
-
-#pragma omp for
-    for (NodeID seq_u = from; seq_u < to; ++seq_u) {
-      const NodeID u = mapper(seq_u);
-
-      if constexpr (filter_invocable_with_unmapped_node) {
-        if (!filter(seq_u, u)) {
-          continue;
-        }
-      } else {
-        if (!filter(u)) {
-          continue;
-        }
-      }
-
-      graph.adjacent_nodes(u, [&](const NodeID v) {
-        if (!graph.is_ghost_node(v)) {
-          return;
-        }
-
-        const PEID pe = graph.ghost_owner(v);
-
-        if (created_message_for_pe.get(pe)) {
-          return;
-        }
-        created_message_for_pe.set(pe);
-
-        const auto slot = --num_messages[thread][pe];
-
-        if constexpr (builder_invocable_with_pe) {
-          send_buffers[pe][slot] = builder(u, pe);
-        } else if constexpr (builder_invocable_with_pe_and_unmapped_node) {
-          send_buffers[pe][slot] = builder(seq_u, u, pe);
-        } else {
-          send_buffers[pe][slot] = builder(u);
-        }
-      });
-
-      created_message_for_pe.reset();
-    }
-  }
-
-  // STOP_TIMER();
+  parallel::deterministic_for(
+      from,
+      to,
+      [&](const NodeID range_from, const NodeID range_to, const PEID thread) {
+        Marker<> created_message_for_pe(static_cast(size));
+
+        for (NodeID seq_u = range_from; seq_u < range_to; ++seq_u) {
+          const NodeID u = mapper(seq_u);
+
+          if constexpr (filter_invocable_with_unmapped_node) {
+            if (!filter(seq_u, u)) {
+              continue;
+            }
+          } else {
+            if (!filter(u)) {
+              continue;
+            }
+          }
+
+          graph.adjacent_nodes(u, [&](const NodeID v) {
+            if (!graph.is_ghost_node(v)) {
+              return;
+            }
+
+            const PEID pe = graph.ghost_owner(v);
+
+            if (created_message_for_pe.get(pe)) {
+              return;
+            }
+            created_message_for_pe.set(pe);
+
+            const auto slot = --num_messages[thread][pe];
+
+            if constexpr (builder_invocable_with_pe) {
+              send_buffers[pe][slot] = builder(u, pe);
+            } else if constexpr (builder_invocable_with_pe_and_unmapped_node) {
+              send_buffers[pe][slot] = builder(seq_u, u, pe);
+            } else {
+              send_buffers[pe][slot] = builder(u);
+            }
+          });
+
+          created_message_for_pe.reset();
+        }
+      }
+  );
 
   sparse_alltoall(
       std::move(send_buffers), std::forward(receiver), graph.communicator()
@@ -752,22 +756,23 @@ void sparse_alltoall_custom(
 
   // START_TIMER("Message construction");
 
   // Allocate message counters
-  const PEID num_threads = omp_get_max_threads();
+  const PEID num_threads = tbb::this_task_arena::max_concurrency();
   std::vector> num_messages(
       num_threads, CacheAlignedVector(size)
   );
 
   // Count messages to each PE for each thread
-#pragma omp parallel default(none) shared(pe_getter, size, from, to, filter, graph, num_messages)
-  {
-    const PEID thread = omp_get_thread_num();
-#pragma omp for
-    for (NodeID u = from; u < to; ++u) {
-      if (filter(u)) {
-        ++num_messages[thread][pe_getter(u)];
-      }
-    }
-  }
+  parallel::deterministic_for(
+      from,
+      to,
+      [&](const NodeID range_from, const NodeID range_to, const PEID thread) {
+        for (NodeID u = range_from; u < range_to; ++u) {
+          if (filter(u)) {
+            ++num_messages[thread][pe_getter(u)];
+          }
+        }
+      }
+  );
 
   // Offset messages for each thread
   internal::inclusive_col_prefix_sum(num_messages);
@@ -778,22 +783,20 @@ void sparse_alltoall_custom(
     send_buffers[pe].resize(num_messages.back()[pe]);
   });
 
-  // fill buffers
-#pragma omp parallel default(none) \
-    shared(pe_getter, send_buffers, size, from, to, builder, filter, graph, num_messages)
-  {
-    const PEID thread = omp_get_thread_num();
-#pragma omp for
-    for (NodeID u = from; u < to; ++u) {
-      if (filter(u)) {
-        const PEID pe = pe_getter(u);
-        const auto slot = --num_messages[thread][pe];
-        send_buffers[pe][slot] = builder(u);
-      }
-    }
-  }
-
-  // STOP_TIMER();
+  // Fill buffers
+  parallel::deterministic_for(
+      from,
+      to,
+      [&](const NodeID range_from, const NodeID range_to, const PEID thread) {
+        for (NodeID u = range_from; u < range_to; ++u) {
+          if (filter(u)) {
+            const PEID pe = pe_getter(u);
+            const auto slot = --num_messages[thread][pe];
+            send_buffers[pe][slot] = builder(u);
+          }
+        }
+      }
+  );
 
   sparse_alltoall(
       std::move(send_buffers), std::forward(receiver), graph.communicator()
@@ -828,4 +831,5 @@ std::vector sparse_alltoall_custom(
   );
   return recv_buffers;
 }
+
 } // namespace kaminpar::mpi::graph

From 71c49d8325ce8ade821819523858a3fc85a22898 Mon Sep 17 00:00:00 2001
From: Daniel Seemaier
Date: Wed, 30 Oct 2024 13:52:12 +0100
Subject: [PATCH 2/6] fix(dist): crash in distributed contraction code when
 executing in hybrid mode due to non-parallelized section

---
 .../coarsening/contraction/global_cluster_contraction.cc | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kaminpar-dist/coarsening/contraction/global_cluster_contraction.cc b/kaminpar-dist/coarsening/contraction/global_cluster_contraction.cc
index b4f0182d..556f040a 100644
--- a/kaminpar-dist/coarsening/contraction/global_cluster_contraction.cc
+++ b/kaminpar-dist/coarsening/contraction/global_cluster_contraction.cc
@@ -311,11 +311,13 @@ find_nonlocal_nodes(const Graph &graph, const StaticArray &lnode_t
   std::unordered_map nonlocal_nodes;
   std::atomic size = 0;
 
-  graph.pfor_all_nodes([&](const NodeID lnode) {
+  for (NodeID lnode : graph.all_nodes()) {
+    // graph.pfor_all_nodes([&](const NodeID lnode) {
     const GlobalNodeID gcluster = lnode_to_gcluster[lnode];
 
     if (graph.is_owned_global_node(gcluster)) {
-      return;
+      // return;
+      continue;
     }
 
     const NodeWeight weight = graph.is_owned_node(lnode) ? graph.node_weight(lnode) : 0;
@@ -333,7 +335,8 @@ find_nonlocal_nodes(const Graph &graph, const StaticArray &lnode_t
     size.fetch_add(1, std::memory_order_relaxed);
     nonlocal_nodes[gcluster + 1] = weight;
   }
-  });
+  //});
+  }
 
   RECORD("nonlocal_nodes") StaticArray dense_nonlocal_nodes(size);
   std::size_t i = 0;

From 288ed7ee9e7e07d96033e3756d11741a2a1e2c46 Mon Sep 17 00:00:00 2001
From: Daniel Seemaier
Date: Wed, 30 Oct 2024 14:03:53 +0100
Subject: [PATCH 3/6] tests(dist): add hybrid endtoend test

---
 tests/endtoend/dist_endtoend_test.cc | 33 ++++++++--------------------
 1 file changed, 9 insertions(+), 24 deletions(-)

diff --git a/tests/endtoend/dist_endtoend_test.cc b/tests/endtoend/dist_endtoend_test.cc
index 8b6670a4..1bceebfd 100644
--- a/tests/endtoend/dist_endtoend_test.cc
+++ b/tests/endtoend/dist_endtoend_test.cc
@@ -144,17 +144,14 @@ TEST(DistEndToEndTest, partitions_unweighted_walshaw_data_graph) {
   EXPECT_EQ(reported_cut, actual_cut / 2);
 }
 
-// Disabled: can fail since we offset the PRNG seed by the thread ID, and not all calls are made by
-// the same threads across multiple runs
-/*
-TEST(DistEndToEndTest, partitions_unweighted_walshaw_data_graph_multiple_times_with_same_seed) {
-  const PEID size = mpi::get_comm_size(MPI_COMM_WORLD);
+TEST(
+    DistEndToEndTest, partitions_unweighted_walshaw_data_graph_multiple_times_with_different_seeds
+) {
   const PEID rank = mpi::get_comm_rank(MPI_COMM_WORLD);
 
   auto vtxdist = data::create_vtxdist();
   auto xadj = data::create_xadj();
 
-  const GlobalNodeID global_n = data::global_xadj.size() - 1;
   const NodeID n = xadj.size() - 1;
 
   GlobalNodeID *vtxdist_ptr = vtxdist.data();
@@ -166,23 +163,20 @@ TEST(DistEndToEndTest, partitions_unweighted_walshaw_data_graph_multiple_times_w
   dKaMinPar dist(MPI_COMM_WORLD, 1, create_default_context()); // 1 thread: deterministic
   dist.set_output_level(OutputLevel::QUIET);
   dist.import_graph(vtxdist_ptr, xadj_ptr, adjncy_ptr, nullptr, nullptr);
-  const EdgeWeight reported_cut = dist.compute_partition(16, seed0_partition.data());
+  dist.compute_partition(16, seed0_partition.data());
 
-  for (const int seed : {0, 0, 0}) {
+  for (const int seed : {1, 2, 3}) {
     std::vector partition(n);
     dKaMinPar::reseed(seed);
     dKaMinPar dist(MPI_COMM_WORLD, 1, create_default_context()); // 1 thread: deterministic
     dist.set_output_level(OutputLevel::QUIET);
     dist.import_graph(vtxdist_ptr, xadj_ptr, adjncy_ptr, nullptr, nullptr);
     dist.compute_partition(16, partition.data());
-    EXPECT_EQ(partition, seed0_partition);
+    EXPECT_NE(partition, seed0_partition);
   }
 }
-*/
 
-TEST(
-    DistEndToEndTest, partitions_unweighted_walshaw_data_graph_multiple_times_with_different_seeds
-) {
+TEST(DistEndToEndTest, partitions_unweighted_walshaw_data_graph_with_three_threads_per_mpi) {
   const PEID rank = mpi::get_comm_rank(MPI_COMM_WORLD);
 
   auto vtxdist = data::create_vtxdist();
@@ -196,18 +190,9 @@ TEST(
   std::vector seed0_partition(n);
   dKaMinPar::reseed(0);
-  dKaMinPar dist(MPI_COMM_WORLD, 1, create_default_context()); // 1 thread: deterministic
+  dKaMinPar dist(MPI_COMM_WORLD, 3, create_default_context());
   dist.set_output_level(OutputLevel::QUIET);
   dist.import_graph(vtxdist_ptr, xadj_ptr, adjncy_ptr, nullptr, nullptr);
-
-  for (const int seed : {1, 2, 3}) {
-    std::vector partition(n);
-    dKaMinPar::reseed(seed);
-    dKaMinPar dist(MPI_COMM_WORLD, 1, create_default_context()); // 1 thread: deterministic
-    dist.set_output_level(OutputLevel::QUIET);
-    dist.import_graph(vtxdist_ptr, xadj_ptr, adjncy_ptr, nullptr, nullptr);
-    dist.compute_partition(16, partition.data());
-    EXPECT_NE(partition, seed0_partition);
-  }
+  dist.compute_partition(16, seed0_partition.data());
 }
 } // namespace kaminpar::dist

From e5c53fff6ae242fc922a2a695304b1a5547e056f Mon Sep 17 00:00:00 2001
From: Daniel Seemaier
Date: Wed, 30 Oct 2024 14:05:00 +0100
Subject: [PATCH 4/6] style(tests): add empty lines after namespace

---
 tests/endtoend/dist_endtoend_test.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/endtoend/dist_endtoend_test.cc b/tests/endtoend/dist_endtoend_test.cc
index 1bceebfd..9f7cfb23 100644
--- a/tests/endtoend/dist_endtoend_test.cc
+++ b/tests/endtoend/dist_endtoend_test.cc
@@ -17,7 +17,9 @@
 #include "kaminpar-common/math.h"
 
 namespace kaminpar::dist {
+
 namespace data {
+
 static std::vector global_xadj = {
 #include "data.graph.xadj"
 };
@@ -54,6 +56,7 @@ std::vector create_xadj() {
 
   return xadj;
 }
+
 } // namespace data
 
 TEST(DistEndToEndTest, partitions_empty_unweighted_graph) {
@@ -195,4 +198,5 @@ TEST(DistEndToEndTest, partitions_unweighted_walshaw_data_graph_with_three_threa
   dist.import_graph(vtxdist_ptr, xadj_ptr, adjncy_ptr, nullptr, nullptr);
   dist.compute_partition(16, seed0_partition.data());
 }
+
 } // namespace kaminpar::dist

From 7142ff424e9ef51e3d51ba401c294cb6e83ff9ae Mon Sep 17 00:00:00 2001
From: Daniel Seemaier
Date: Wed, 30 Oct 2024 14:11:58 +0100
Subject: [PATCH 5/6] refactor(dist): remove remaining OpenMP dependencies

---
 apps/benchmarks/dist_block_clustering_benchmark.cc |  2 --
 apps/benchmarks/dist_coarsening_benchmark.cc       |  2 --
 apps/benchmarks/dist_coloring_benchmark.cc         |  2 --
 apps/benchmarks/dist_contraction_benchmark.cc      |  2 --
 apps/benchmarks/dist_refinement_benchmark.cc       |  2 --
 kaminpar-dist/datastructures/growt.h               | 12 ++++++------
 kaminpar-dist/dkaminpar.cc                         |  2 --
 kaminpar-mpi/CMakeLists.txt                        |  3 +--
 8 files changed, 7 insertions(+), 20 deletions(-)

diff --git a/apps/benchmarks/dist_block_clustering_benchmark.cc b/apps/benchmarks/dist_block_clustering_benchmark.cc
index bffda636..9e399c3c 100644
--- a/apps/benchmarks/dist_block_clustering_benchmark.cc
+++ b/apps/benchmarks/dist_block_clustering_benchmark.cc
@@ -12,7 +12,6 @@
 #include
 #include
-#include
 
 #include "kaminpar-dist/context.h"
 #include "kaminpar-dist/context_io.h"
@@ -56,7 +55,6 @@ int main(int argc, char *argv[]) {
   CLI11_PARSE(app, argc, argv);
 
   tbb::global_control gc(tbb::global_control::max_allowed_parallelism, ctx.parallel.num_threads);
-  omp_set_num_threads(ctx.parallel.num_threads);
 
   auto wrapper = load_partitioned_graph(graph_filename, partition_filename);
   auto &graph = *wrapper.graph;
diff --git a/apps/benchmarks/dist_coarsening_benchmark.cc b/apps/benchmarks/dist_coarsening_benchmark.cc
index 9a175253..e1f19e81 100644
--- a/apps/benchmarks/dist_coarsening_benchmark.cc
+++ b/apps/benchmarks/dist_coarsening_benchmark.cc
@@ -10,7 +10,6 @@
 // clang-format on
 
 #include
-#include
 
 #include "kaminpar-dist/coarsening/coarsener.h"
 #include "kaminpar-dist/context.h"
@@ -49,7 +48,6 @@ int main(int argc, char *argv[]) {
   CLI11_PARSE(app, argc, argv);
 
   tbb::global_control gc(tbb::global_control::max_allowed_parallelism, ctx.parallel.num_threads);
-  omp_set_num_threads(ctx.parallel.num_threads);
 
   auto wrapper = load_graph(graph_filename);
   auto &graph = *wrapper.graph;
diff --git a/apps/benchmarks/dist_coloring_benchmark.cc b/apps/benchmarks/dist_coloring_benchmark.cc
index 8ac731c3..e47b826f 100644
--- a/apps/benchmarks/dist_coloring_benchmark.cc
+++ b/apps/benchmarks/dist_coloring_benchmark.cc
@@ -10,7 +10,6 @@
 // clang-format on
 
 #include
-#include
 #include
 #include
 
@@ -38,7 +37,6 @@ int main(int argc, char *argv[]) {
   CLI11_PARSE(app, argc, argv);
 
   tbb::global_control gc(tbb::global_control::max_allowed_parallelism, ctx.parallel.num_threads);
-  omp_set_num_threads(ctx.parallel.num_threads);
 
   auto wrapper = load_graph(graph_filename);
   auto &graph = *wrapper.graph;
diff --git a/apps/benchmarks/dist_contraction_benchmark.cc b/apps/benchmarks/dist_contraction_benchmark.cc
index 1ba08532..9a4776ae 100644
--- a/apps/benchmarks/dist_contraction_benchmark.cc
+++ b/apps/benchmarks/dist_contraction_benchmark.cc
@@ -10,7 +10,6 @@
 // clang-format on
 
 #include
-#include
 
 #include "kaminpar-dist/coarsening/contraction/global_cluster_contraction.h"
 #include "kaminpar-dist/context.h"
@@ -42,7 +41,6 @@ int main(int argc, char *argv[]) {
   CLI11_PARSE(app, argc, argv);
 
   tbb::global_control gc(tbb::global_control::max_allowed_parallelism, ctx.parallel.num_threads);
-  omp_set_num_threads(ctx.parallel.num_threads);
 
   auto wrapper = load_graph(graph_filename);
   auto &graph = *wrapper.graph;
diff --git a/apps/benchmarks/dist_refinement_benchmark.cc b/apps/benchmarks/dist_refinement_benchmark.cc
index 993bc3e9..0eca225e 100644
--- a/apps/benchmarks/dist_refinement_benchmark.cc
+++ b/apps/benchmarks/dist_refinement_benchmark.cc
@@ -12,7 +12,6 @@
 #include
 #include
-#include
 
 #include "kaminpar-dist/context.h"
 #include "kaminpar-dist/context_io.h"
@@ -62,7 +61,6 @@ int main(int argc, char *argv[]) {
   }
 
   tbb::global_control gc(tbb::global_control::max_allowed_parallelism, ctx.parallel.num_threads);
-  omp_set_num_threads(ctx.parallel.num_threads);
 
   auto wrapper = load_partitioned_graph(graph_filename, partition_filename);
   auto &graph = *wrapper.graph;
diff --git a/kaminpar-dist/datastructures/growt.h b/kaminpar-dist/datastructures/growt.h
index 069625c2..e504fb8e 100644
--- a/kaminpar-dist/datastructures/growt.h
+++ b/kaminpar-dist/datastructures/growt.h
@@ -10,6 +10,8 @@
 #include
 #include
+#include
+#include
 
 #include "kaminpar-dist/dkaminpar.h"
 
@@ -59,8 +61,7 @@ using StaticGhostNodeMapping = typename ::growt::
 template void pfor_map(Map &map, Lambda &&lambda) {
   std::atomic_size_t counter = 0;
 
-#pragma omp parallel default(none) shared(map, counter, lambda)
-  {
+  tbb::parallel_for(0, tbb::this_task_arena::max_concurrency(), [&](const int) {
     const std::size_t capacity = map.capacity();
     std::size_t cur_block = counter.fetch_add(4096);
 
@@ -71,14 +72,13 @@ template void pfor_map(Map &map, Lambda &&lambda
       }
       cur_block = counter.fetch_add(4096);
     }
-  }
+  });
 }
 
 template void pfor_handles(Handles &handles, Lambda &&lambda) {
   std::atomic_size_t counter = 0;
 
-#pragma omp parallel default(none) shared(handles, counter, lambda)
-  {
+  tbb::parallel_for(0, tbb::this_task_arena::max_concurrency(), [&](const int) {
     auto &handle = handles.local();
     const std::size_t capacity = handle.capacity();
     std::size_t cur_block = counter.fetch_add(4096);
@@ -90,6 +90,6 @@ template void pfor_handles(Handles &handles,
       }
       cur_block = counter.fetch_add(4096);
     }
-  }
+  });
 }
 } // namespace kaminpar::dist::growt
diff --git a/kaminpar-dist/dkaminpar.cc b/kaminpar-dist/dkaminpar.cc
index 965bafe4..5aa967b1 100644
--- a/kaminpar-dist/dkaminpar.cc
+++ b/kaminpar-dist/dkaminpar.cc
@@ -11,7 +11,6 @@
 #include
 #include
-#include
 #include
 #include
 
@@ -157,7 +156,6 @@ dKaMinPar::dKaMinPar(MPI_Comm comm, const int num_threads, const Context ctx)
       _num_threads(num_threads),
       _ctx(ctx),
       _gc(tbb::global_control::max_allowed_parallelism, num_threads) {
-  omp_set_num_threads(num_threads);
 #ifdef KAMINPAR_ENABLE_TIMERS
   GLOBAL_TIMER.reset();
 #endif // KAMINPAR_ENABLE_TIMERS
diff --git a/kaminpar-mpi/CMakeLists.txt b/kaminpar-mpi/CMakeLists.txt
index 8ad648aa..d3c782dd 100644
--- a/kaminpar-mpi/CMakeLists.txt
+++ b/kaminpar-mpi/CMakeLists.txt
@@ -8,5 +8,4 @@ target_compile_options(kaminpar_mpi PRIVATE ${KAMINPAR_WARNING_FLAGS})
 
 find_library(NUMA_LIB numa) # optional
 
-find_package(OpenMP REQUIRED)
-target_link_libraries(kaminpar_mpi PUBLIC kaminpar_common MPI::MPI_CXX OpenMP::OpenMP_CXX)
+target_link_libraries(kaminpar_mpi PUBLIC kaminpar_common MPI::MPI_CXX)

From 6940b69f306eb32b5b1a6db1fa1ca8730c61fdb7 Mon Sep 17 00:00:00 2001
From: Daniel Seemaier
Date: Wed, 30 Oct 2024 14:14:05 +0100
Subject: [PATCH 6/6] ci: no longer install OpenMP

---
 .github/workflows/main.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index aff26c47..dd92f39e 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -19,7 +19,7 @@ jobs:
         build-mode: [Release]
     steps:
       - name: Install dependencies
-        run: brew install llvm libomp tbb open-mpi google-sparsehash
+        run: brew install llvm tbb open-mpi google-sparsehash
      - name: Checkout HEAD
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
        with:
@@ -63,7 +63,7 @@ jobs:
        build-mode: [Release]
    steps:
      - name: Install dependencies
-        run: sudo apt-get install -y libtbb-dev libhwloc-dev mpi libopenmpi-dev libomp-dev libsparsehash-dev
+        run: sudo apt-get install -y libtbb-dev libhwloc-dev mpi libopenmpi-dev libsparsehash-dev
      - name: Checkout HEAD
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
        with:
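
Editor's note on the series: every patch above replaces an OpenMP worksharing construct with `parallel::deterministic_for` from `kaminpar-common/parallel/loops.h`, but that helper itself is not part of the series, so its contract has to be inferred from the call sites. The following is a minimal, hypothetical sketch of that assumed contract only — split `[from, to)` into one contiguous chunk per TBB concurrency slot and pass the chunk index where the old code used `omp_get_thread_num()` — not the actual kaminpar-common implementation; the name `sketch::deterministic_for` and the chunking policy are assumptions.

```cpp
// Hypothetical sketch of the deterministic_for contract assumed by the patches above.
// This is NOT the kaminpar-common implementation; it only illustrates the shape of the API.
#include <cstdint>
#include <tbb/parallel_for.h>
#include <tbb/task_arena.h>

namespace sketch {

template <typename Index, typename Lambda>
void deterministic_for(const Index from, const Index to, Lambda &&lambda) {
  // One contiguous chunk per concurrency slot: the chunk -> "thread" mapping (and thus the
  // contents of per-thread counters indexed by it) does not depend on TBB scheduling decisions.
  const int num_chunks = tbb::this_task_arena::max_concurrency();
  const std::uint64_t total = static_cast<std::uint64_t>(to - from);

  tbb::parallel_for(0, num_chunks, [&](const int chunk) {
    const Index chunk_from = from + static_cast<Index>(total * chunk / num_chunks);
    const Index chunk_to = from + static_cast<Index>(total * (chunk + 1) / num_chunks);
    // The chunk index plays the role of the former omp_get_thread_num().
    lambda(chunk_from, chunk_to, chunk);
  });
}

} // namespace sketch
```

Under this reading, the `num_messages[thread][pe]` counters in patch 1 stay reproducible across runs as long as the arena size is fixed, which is presumably why the per-thread indexing scheme carried over from the OpenMP version unchanged.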