[KaMinPar] Rework settings for max core usage during deep initial partitioning
KaHIP · Sep 18, 2023 · f157df3 · f157df3
1 parent 49f59a2
commit f157df3
Show file tree

Hide file tree

Showing 9 changed files with 61 additions and 33 deletions.
diff --git a/kaminpar/context_io.cc b/kaminpar/context_io.cc
@@ -164,7 +164,6 @@ void print(const LabelPropagationCoarseningContext &lp_ctx, std::ostream &out) {
 }
 
+// Prints the initial partitioning settings stored in `i_ctx` to `out`.
 void print(const InitialPartitioningContext &i_ctx, std::ostream &out) {
-  out << "Initial partitioning mode:    " << i_ctx.mode << "\n";
   out << "Adaptive algorithm selection: "
       << (i_ctx.use_adaptive_bipartitioner_selection ? "yes" : "no") << "\n";
 }
@@ -206,12 +205,20 @@ void print(const PartitionContext &p_ctx, std::ostream &out) {
       << p_ctx.block_weights.perfectly_balanced(0) << " + " << 100 * p_ctx.epsilon << "%)\n";
 }
 
+// Prints the partitioning scheme settings stored in `p_ctx` to `out`. The settings that only
+// apply to deep multilevel partitioning are listed only when that scheme is selected.
+void print(const PartitioningContext &p_ctx, std::ostream &out) {
+  out << "Partitioning mode:            " << p_ctx.mode << "\n";
+  if (p_ctx.mode == PartitioningMode::DEEP) {
+    // Labels are padded to 30 characters, like the other labels printed here, so that the
+    // values start in the same column.
+    out << "  Deep initial part. mode:    " << p_ctx.deep_initial_partitioning_mode << "\n";
+    out << "  Deep initial part. load:    " << p_ctx.deep_initial_partitioning_load << "\n";
+  }
+}
+
 void print(const Context &ctx, std::ostream &out) {
   out << "Execution mode:               " << ctx.parallel.num_threads << "\n";
   out << "Graph:                        " << ctx.debug.graph_name << "\n";
   print(ctx.partition, out);
   cio::print_delimiter("Partitioning Scheme", '-');
-  out << "Partitioning mode:            " << ctx.mode << "\n";
+  print(ctx.partitioning, out);
   cio::print_delimiter("Coarsening", '-');
   print(ctx.coarsening, out);
   cio::print_delimiter("Initial Partitioning", '-');

diff --git a/kaminpar/context_io.h b/kaminpar/context_io.h
@@ -38,6 +38,7 @@ std::ostream &operator<<(std::ostream &out, InitialPartitioningMode mode);
 std::unordered_map<std::string, InitialPartitioningMode> get_initial_partitioning_modes();
 
 void print(const Context &ctx, std::ostream &out);
+void print(const PartitioningContext &p_ctx, std::ostream &out);
 void print(const PartitionContext &p_ctx, std::ostream &out);
 void print(const RefinementContext &r_ctx, std::ostream &out);
 void print(const CoarseningContext &c_ctx, std::ostream &out);

diff --git a/kaminpar/factories.cc b/kaminpar/factories.cc
@@ -31,7 +31,7 @@
 
 namespace kaminpar::shm::factory {
 std::unique_ptr<Partitioner> create_partitioner(const Graph &graph, const Context &ctx) {
-  switch (ctx.mode) {
+  switch (ctx.partitioning.mode) {
   case PartitioningMode::DEEP: {
     return std::make_unique<DeepMultilevelPartitioner>(graph, ctx);
   }

diff --git a/kaminpar/kaminpar.h b/kaminpar/kaminpar.h
@@ -192,8 +192,6 @@ struct InitialRefinementContext {
 };
 
 struct InitialPartitioningContext {
-  InitialPartitioningMode mode;
-
   InitialCoarseningContext coarsening;
   InitialRefinementContext refinement;
 
@@ -203,7 +201,6 @@ struct InitialPartitioningContext {
   std::size_t max_num_repetitions;
   std::size_t num_seed_iterations;
   bool use_adaptive_bipartitioner_selection;
-  std::size_t multiplier_exponent;
 };
 
 //
@@ -260,9 +257,15 @@ enum class PartitioningMode {
   RB,
 };
 
-struct Context {
+// Settings that select the partitioning scheme and configure initial partitioning for the deep
+// multilevel scheme.
+struct PartitioningContext {
+  // Partitioning scheme: deep multilevel or recursive multilevel bipartitioning.
   PartitioningMode mode;
 
+  // Initial partitioning mode used by the deep scheme: sequential, async-parallel or
+  // sync-parallel.
+  InitialPartitioningMode deep_initial_partitioning_mode;
+  // Fraction of cores used for the coarse graph replication phase of the deep scheme.
+  double deep_initial_partitioning_load;
+};
+
+struct Context {
+  PartitioningContext partitioning;
   PartitionContext partition;
   CoarseningContext coarsening;
   InitialPartitioningContext initial_partitioning;

diff --git a/kaminpar/partitioning/deep/deep_multilevel.cc b/kaminpar/partitioning/deep/deep_multilevel.cc
@@ -135,7 +135,7 @@ const Graph *DeepMultilevelPartitioner::coarsen() {
 }
 
 NodeID DeepMultilevelPartitioner::initial_partitioning_threshold() {
-  if (helper::parallel_ip_mode(_input_ctx.initial_partitioning.mode)) {
+  if (helper::parallel_ip_mode(_input_ctx.partitioning.deep_initial_partitioning_mode)) {
     return _input_ctx.parallel.num_threads * _input_ctx.coarsening.contraction_limit; // p * C
   } else {
     return 2 * _input_ctx.coarsening.contraction_limit; // 2 * C
@@ -149,15 +149,16 @@ PartitionedGraph DeepMultilevelPartitioner::initial_partition(const Graph *graph
   // If requested, dump the coarsest graph to disk. Note that in the context of
   // deep multilevel, this is not actually the coarsest graph, but rather the
   // coarsest graph before splitting PEs and duplicating the graph.
-  // Disable worker splitting with --i-mode=sequential to obtain coarser graphs.
+  // Disable worker splitting with --p-deep-initial-partitioning-mode=sequential to obtain coarser
+  // graphs.
   debug::dump_coarsest_graph(*graph, _input_ctx.debug);
   debug::dump_graph_hierarchy(*graph, _coarsener->size(), _input_ctx.debug);
 
   // Since timers are not multi-threaded, we disable them during (parallel)
   // initial partitioning.
   DISABLE_TIMERS();
   PartitionedGraph p_graph = [&] {
-    switch (_input_ctx.initial_partitioning.mode) {
+    switch (_input_ctx.partitioning.deep_initial_partitioning_mode) {
     case InitialPartitioningMode::SEQUENTIAL:
       return helper::bipartition(graph, _input_ctx.partition.k, _input_ctx, _ip_m_ctx_pool);
 

diff --git a/kaminpar/partitioning/deep/sync_initial_partitioning.cc b/kaminpar/partitioning/deep/sync_initial_partitioning.cc
@@ -32,12 +32,16 @@ SyncInitialPartitioner::partition(const Coarsener *coarsener, const PartitionCon
   std::atomic<bool> converged = false;
 
   std::vector<std::size_t> num_local_copies_record;
+
   while (num_current_copies < num_threads) {
     const NodeID n = coarseners.back()[0]->coarsest_graph()->n();
     const std::size_t num_local_copies =
         helper::compute_num_copies(_input_ctx, n, converged, num_current_threads);
     num_local_copies_record.push_back(num_local_copies);
 
+    DBG << V(num_current_copies) << V(num_threads) << V(num_current_threads) << V(num_local_copies);
+
+
     // Create coarseners and partition contexts for next coarsening iteration
     coarseners.emplace_back(num_current_copies * num_local_copies);
     auto &next_coarseners = coarseners.back();

diff --git a/kaminpar/partitioning/helper.cc b/kaminpar/partitioning/helper.cc
@@ -261,7 +261,8 @@ select_best(const scalable_vector<PartitionedGraph> &p_graphs, const PartitionCo
 }
 
 std::size_t compute_num_threads_for_parallel_ip(const Context &input_ctx) {
-  return math::floor2(static_cast<unsigned int>(input_ctx.parallel.num_threads)) *
-         (1 << input_ctx.initial_partitioning.multiplier_exponent);
+  return math::floor2(static_cast<unsigned int>(
+      1.0 * input_ctx.parallel.num_threads * input_ctx.partitioning.deep_initial_partitioning_load
+  ));
 }
 } // namespace kaminpar::shm::partitioning::helper
diff --git a/kaminpar/presets.cc b/kaminpar/presets.cc
@@ -45,8 +45,12 @@ std::unordered_set<std::string> get_preset_names() {
 
 Context create_default_context() {
   return {
-      .mode = PartitioningMode::DEEP,
-      // Context
+      .partitioning =
+          {
+              .mode = PartitioningMode::DEEP,
+              .deep_initial_partitioning_mode = InitialPartitioningMode::SYNCHRONOUS_PARALLEL,
+              .deep_initial_partitioning_load = 1.0,
+          },
       .partition =
           {
               // Context -> Partition
@@ -73,7 +77,6 @@ Context create_default_context() {
           },
       .initial_partitioning =
           {
-              .mode = InitialPartitioningMode::SYNCHRONOUS_PARALLEL,
               .coarsening =
                   {
                       .contraction_limit = 20,
@@ -97,7 +100,6 @@ Context create_default_context() {
               .max_num_repetitions = 50,
               .num_seed_iterations = 1,
               .use_adaptive_bipartitioner_selection = true,
-              .multiplier_exponent = 0,
           },
       .refinement =
           {
@@ -153,11 +155,11 @@ Context create_default_context() {
 
+// Creates the fast preset: the default context with sequential deep initial partitioning, a
+// single label propagation iteration during coarsening, and exactly one initial partitioning
+// repetition.
 Context create_fast_context() {
   Context ctx = create_default_context();
+  ctx.partitioning.deep_initial_partitioning_mode = InitialPartitioningMode::SEQUENTIAL;
   ctx.coarsening.lp.num_iterations = 1;
   ctx.initial_partitioning.min_num_repetitions = 1;
   ctx.initial_partitioning.min_num_non_adaptive_repetitions = 1;
   ctx.initial_partitioning.max_num_repetitions = 1;
-  ctx.initial_partitioning.mode = InitialPartitioningMode::SEQUENTIAL;
   return ctx;
 }
 

diff --git a/kaminpar_cli/kaminpar_arguments.cc b/kaminpar_cli/kaminpar_arguments.cc
@@ -34,12 +34,30 @@ CLI::Option_group *create_partitioning_options(CLI::App *app, Context &ctx) {
       ->check(CLI::NonNegativeNumber)
       ->capture_default_str();
 
-  partitioning->add_option("-m,--mode", ctx.mode)
+  // Partitioning options
+  partitioning->add_option("-m,--p-mode", ctx.partitioning.mode)
       ->transform(CLI::CheckedTransformer(get_partitioning_modes()).description(""))
       ->description(R"(Partitioning scheme:
   - deep: deep multilevel
   - rb:   recursive multilevel bipartitioning)")
       ->capture_default_str();
+  partitioning
+      ->add_option(
+          "--p-deep-initial-partitioning-mode", ctx.partitioning.deep_initial_partitioning_mode
+      )
+      ->transform(CLI::CheckedTransformer(get_initial_partitioning_modes()).description(""))
+      ->description(R"(Chooses the initial partitioning mode:
+  - sequential:     do not diversify initial partitioning by replicating coarse graphs
+  - async-parallel: diversify initial partitioning by replicating coarse graphs each branch of the replication tree asynchronously
+  - sync-parallel:  same as async-parallel, but process branches synchronously)")
+      ->capture_default_str();
+  partitioning->add_option(
+      "--p-deep-initial-partitioning-load",
+      ctx.partitioning.deep_initial_partitioning_load,
+      "Fraction of cores that should be used for the coarse graph replication phase of deep MGP. A "
+      "value of '1' will replicate the graph once for every PE, whereas smaller values lead to "
+      "fewer replications."
+  );
 
   return partitioning;
 }
@@ -132,14 +150,6 @@ CLI::Option_group *create_lp_coarsening_options(CLI::App *app, Context &ctx) {
 CLI::Option_group *create_initial_partitioning_options(CLI::App *app, Context &ctx) {
   auto *ip = app->add_option_group("Initial Partitioning");
 
-  ip->add_option("--i-mode", ctx.initial_partitioning.mode)
-      ->transform(CLI::CheckedTransformer(get_initial_partitioning_modes()).description(""))
-      ->description(R"(Chooses the initial partitioning mode:
-  - sequential:     do not diversify initial partitioning by replicating coarse graphs
-  - async-parallel: diversify initial partitioning by replicating coarse graphs each branch of the replication tree asynchronously
-  - sync-parallel:  same as async-parallel, but process branches synchronously)")
-      ->capture_default_str();
-
   /*
   ip->add_option(
         "--i-c-contraction-limit",
@@ -152,10 +162,11 @@ CLI::Option_group *create_initial_partitioning_options(CLI::App *app, Context &c
   )
       ->transform(CLI::CheckedTransformer(get_cluster_weight_limits()).description(""))
       ->description(
-          R"(This option selects the formula used to compute the weight limit for nodes in coarse graphs. 
-The weight limit can additionally be scaled by a constant multiplier set by the --c-cluster-weight-multiplier option.
-Options are:
-  - epsilon-block-weight: Cmax = eps * c(V) * min{n' / C, k}, where n' is the number of nodes in the current (coarse) graph
+          R"(This option selects the formula used to compute the weight limit for nodes in coarse
+graphs. The weight limit can additionally be scaled by a constant multiplier set by the
+--c-cluster-weight-multiplier option. Options are:
+  - epsilon-block-weight: Cmax = eps * c(V) * min{n' / C, k}, where n' is the number of nodes in the
+current (coarse) graph
   - static-block-weight:  Cmax = c(V) / k
   - one:                  Cmax = 1
   - zero:                 Cmax = 0 (disable coarsening))"
@@ -177,8 +188,6 @@ Options are:
   */
 
   /*
-  ip->add_option("--i-rep-exp", ctx.initial_partitioning.multiplier_exponent)
-      ->capture_default_str();
   ip->add_option("--i-rep-multiplier", ctx.initial_partitioning.repetition_multiplier)
       ->capture_default_str();
   ip->add_option("--i-min-reps", ctx.initial_partitioning.min_num_repetitions)
@@ -200,7 +209,7 @@ Options are:
         "--i-r-disable", ctx.initial_partitioning.refinement.disabled, "Disable initial refinement."
   )
       ->capture_default_str();
-  
+
   /*
   ip->add_option("--i-r-stopping-rule", ctx.initial_partitioning.refinement.stopping_rule)
       ->transform(CLI::CheckedTransformer(get_fm_stopping_rules()).description(""))