*: try a 2D decomposition of GPU SpMM

rohany · Mar 1, 2022 · 8899065 · 8899065
1 parent e58a72a
commit 8899065
Show file tree

Hide file tree

Showing 11 changed files with 572 additions and 33 deletions.
diff --git a/legion/include/taco_mapper.h b/legion/include/taco_mapper.h
@@ -197,7 +197,9 @@ class TACOMapper : public Legion::Mapping::DefaultMapper {
   // InFlightTask represents a task currently being executed.
   struct InFlightTask {
     // Unique identifier of the task instance.
-    Legion::UniqueID id;
+    // Legion::UniqueID id;
+    // TODO (rohany): Comment.
+    std::pair<Legion::Domain, size_t> id;
     // An event that will be triggered when the task finishes.
     Legion::Mapping::MapperEvent event;
     // A clock measurement from when the task was scheduled.

diff --git a/legion/spinnerprod/taco-generated.cpp b/legion/spinnerprod/taco-generated.cpp
@@ -458,13 +458,13 @@ double computeLegionDDS(Legion::Context ctx, Legion::Runtime* runtime, LegionTen
 void registerTacoTasks() {
   {
     TaskVariantRegistrar registrar(taskID(1), "task_1");
-    registrar.add_constraint(ProcessorConstraint(Processor::OMP_PROC));
+    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
     registrar.set_leaf();
     Runtime::preregister_task_variant<double,task_1>(registrar, "task_1");
   }
   {
     TaskVariantRegistrar registrar(taskID(2), "task_2");
-    registrar.add_constraint(ProcessorConstraint(Processor::OMP_PROC));
+    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
     registrar.set_leaf();
     Runtime::preregister_task_variant<double,task_2>(registrar, "task_2");
   }

diff --git a/legion/spmm/main.cpp b/legion/spmm/main.cpp
@@ -6,30 +6,38 @@
 #include "legion_utils.h"
 #include "legion_string_utils.h"
 #include "error.h"
+#ifdef TACO_USE_CUDA
+#include "taco-generated.cuh"
+#else
 #include "taco-generated.h"
+#endif
 
 using namespace Legion;
 typedef double valType;
 
 void top_level_task(const Task* task, const std::vector<PhysicalRegion>& regions, Context ctx, Runtime* runtime) {
   std::string csrFileName;
-  bool dump = false;
+  bool dump = false, consMem = false;
   // The j-dimension if the computation will commonly have a small value
   // that is divisible by 32, as per Stephen and Chang-wan.
-  int n = 10, pieces = 0, warmup = 5, jDim = 32;
+  int n = 10, pieces = 0, warmup = 5, jDim = 32, gx = 0, gy = 0;
   Realm::CommandLineParser parser;
   parser.add_option_string("-tensor", csrFileName);
   parser.add_option_bool("-dump", dump);
   parser.add_option_int("-n", n);
   parser.add_option_int("-pieces", pieces);
   parser.add_option_int("-warmup", warmup);
   parser.add_option_int("-jdim", jDim);
+  // Arguments for memory-conserving schedule.
+  parser.add_option_bool("-consMem", consMem);
+  parser.add_option_int("-gx", gx);
+  parser.add_option_int("-gy", gy);
   auto args = Runtime::get_input_args();
   taco_uassert(parser.parse_command_line(args.argc, args.argv)) << "Parse failure.";
   taco_uassert(!csrFileName.empty()) << "Provide a matrix with -tensor";
 
   // Figure out how many pieces to chop up the data into.
-  if (pieces == 0) {
+  if (!consMem && pieces == 0) {
     pieces = getNumPieces(ctx, runtime);
     taco_uassert(pieces != 0) << "Please provide a number of pieces to split into with -pieces. Unable to automatically find.";
   }
@@ -41,20 +49,41 @@ void top_level_task(const Task* task, const std::vector<PhysicalRegion>& regions
   runtime->fill_field(ctx, A.vals, A.valsParent, FID_VAL, valType(0));
   runtime->fill_field(ctx, C.vals, C.valsParent, FID_VAL, valType(1));
 
-  auto pack = partitionForcomputeLegion(ctx, runtime, &A, &B, &C, pieces);
+// #ifdef TACO_USE_CUDA
+  partitionPackForcomputeLegion pack;
+  partitionPackForcomputeLegionConsMem packCons;
+  if (consMem) {
+    packCons = partitionForcomputeLegionConsMem(ctx, runtime, &A, &B, &C, gx, gy);
+  } else {
+    pack = partitionForcomputeLegion(ctx, runtime, &A, &B, &C, pieces);
+  }
+// #else
+//   auto pack = partitionForcomputeLegion(ctx, runtime, &A, &B, &C, pieces);
+//   taco_iassert(!consMem);
+// #endif
 
-  auto commPart = createSparseAliasingPartitions(ctx, runtime, A.vals.get_index_space(), pack.APartition.valsPartition.get_index_partition());
-  auto commLPart = runtime->get_logical_partition(ctx, A.vals, commPart);
+
+  LogicalPartition commLPart;
+  if (!consMem) {
+    auto commPart = createSparseAliasingPartitions(ctx, runtime, A.vals.get_index_space(), pack.APartition.valsPartition.get_index_partition());
+    commLPart = runtime->get_logical_partition(ctx, A.vals, commPart);
+  }
 
   auto avgTime = benchmarkAsyncCallWithWarmup(ctx, runtime, warmup, n, [&]() {
     if (dump) { runtime->fill_field(ctx, A.vals, A.valsParent, FID_VAL, valType(0)); }
-    computeLegion(ctx, runtime, &A, &B, &C, &pack, pieces);
-#ifdef TACO_USE_CUDA
-    // Collapse our reduction buffers. We use sparse instances to force just the communication
-    // that we want. We only do this for the GPU schedule, as the CPU schedule does not
-    // use Legion reductions.
-    launchDummyReadOverPartition(ctx, runtime, A.vals, commLPart, FID_VAL, Rect<1>(0, pieces - 1), false /* wait */, true /* untrack */, false /* cpuOnly */, true /* sparse */);
-#endif
+// #ifdef TACO_USE_CUDA
+    if (consMem) {
+      computeLegionConsMem(ctx, runtime, &A, &B, &C, &packCons, gx, gy);
+    } else {
+      computeLegion(ctx, runtime, &A, &B, &C, &pack, pieces);
+      // Collapse our reduction buffers. We use sparse instances to force just the communication
+      // that we want. We only do this for the GPU schedule, as the CPU schedule does not
+      // use Legion reductions.
+      launchDummyReadOverPartition(ctx, runtime, A.vals, commLPart, FID_VAL, Rect<1>(0, pieces - 1), false /* wait */, true /* untrack */, false /* cpuOnly */, true /* sparse */);
+    }
+// #else
+//     computeLegion(ctx, runtime, &A, &B, &C, &pack, pieces);
+// #endif
   });
   LEGION_PRINT_ONCE(runtime, ctx, stdout, "Average execution time: %lf ms\n", avgTime);
 

diff --git a/legion/spmm/taco-generated.cpp b/legion/spmm/taco-generated.cpp
@@ -18,6 +18,14 @@ struct task_1Args {
   int32_t gx;
 };
 
+struct task_2Args {
+  int64_t A2_dimension;
+  int64_t B1_dimension;
+  int64_t C2_dimension;
+  int32_t gx;
+  int32_t gy;
+};
+
 
 partitionPackForcomputeLegion partitionForcomputeLegion(Legion::Context ctx, Legion::Runtime* runtime, LegionTensor* A, LegionTensor* B, LegionTensor* C, int32_t gx) {
   RegionWrapper A_vals = A->vals;
@@ -164,12 +172,192 @@ void computeLegion(Legion::Context ctx, Legion::Runtime* runtime, LegionTensor*
   launcher.add_region_requirement(RegionRequirement(get_logical_region(C_vals), READ_ONLY, EXCLUSIVE, C_vals_parent).add_field(FID_VAL));
   runtime->execute_index_space(ctx, launcher);
 
+}
+
+partitionPackForcomputeLegionConsMem partitionForcomputeLegionConsMem(Legion::Context ctx, Legion::Runtime* runtime, LegionTensor* A, LegionTensor* B, LegionTensor* C, int32_t gx, int32_t gy) {
+  RegionWrapper A_vals = A->vals;
+  IndexSpace A_dense_run_0 = A->denseLevelRuns[0];
+  int B1_dimension = B->dims[0];
+  int B2_dimension = B->dims[1];
+  RegionWrapper B2_pos = B->indices[1][0];
+  RegionWrapper B2_crd = B->indices[1][1];
+  auto B2_pos_parent = B->indicesParents[1][0];
+  RegionWrapper B_vals = B->vals;
+  IndexSpace B_dense_run_0 = B->denseLevelRuns[0];
+  int C2_dimension = C->dims[1];
+  RegionWrapper C_vals = C->vals;
+  IndexSpace C_dense_run_0 = C->denseLevelRuns[0];
+
+  int64_t B2Size = runtime->get_index_space_domain(ctx, get_index_space(B2_crd)).hi()[0] + 1;
+
+  Point<2> lowerBound = Point<2>(0, 0);
+  Point<2> upperBound = Point<2>((gx - 1), (gy - 1));
+  auto distFusedIndexSpace = runtime->create_index_space(ctx, Rect<2>(lowerBound, upperBound));
+  DomainT<2> domain = runtime->get_index_space_domain(ctx, IndexSpaceT<2>(distFusedIndexSpace));
+  auto ADomain = runtime->get_index_space_domain(ctx, A_dense_run_0);
+  auto BDomain = runtime->get_index_space_domain(ctx, B_dense_run_0);
+  auto CDomain = runtime->get_index_space_domain(ctx, C_dense_run_0);
+  DomainPointColoring AColoring = DomainPointColoring();
+  DomainPointColoring BColoring = DomainPointColoring();
+  DomainPointColoring CColoring = DomainPointColoring();
+  for (PointInDomainIterator<2> itr = PointInDomainIterator<2>(domain); itr.valid(); itr++) {
+    int64_t io = (*itr)[0];
+    int64_t jo = (*itr)[1];
+    Point<2> AStart = Point<2>((io * ((B1_dimension + (gx - 1)) / gx)), (jo * ((C2_dimension + (gy - 1)) / gy)));
+    Point<2> AEnd = Point<2>(TACO_MIN((io * ((B1_dimension + (gx - 1)) / gx) + ((B1_dimension + (gx - 1)) / gx - 1)), ADomain.hi()[0]), TACO_MIN((jo * ((C2_dimension + (gy - 1)) / gy) + ((C2_dimension + (gy - 1)) / gy - 1)), ADomain.hi()[1]));
+    Rect<2> ARect = Rect<2>(AStart, AEnd);
+    if (!ADomain.contains(ARect.lo) || !ADomain.contains(ARect.hi)) {
+      ARect = ARect.make_empty();
+    }
+    AColoring[(*itr)] = ARect;
+    Point<1> BStart = Point<1>((io * ((B1_dimension + (gx - 1)) / gx)));
+    Point<1> BEnd = Point<1>(TACO_MIN((io * ((B1_dimension + (gx - 1)) / gx) + ((B1_dimension + (gx - 1)) / gx - 1)), BDomain.hi()[0]));
+    Rect<1> BRect = Rect<1>(BStart, BEnd);
+    if (!BDomain.contains(BRect.lo) || !BDomain.contains(BRect.hi)) {
+      BRect = BRect.make_empty();
+    }
+    BColoring[(*itr)] = BRect;
+    Point<2> CStart = Point<2>(0, (jo * ((C2_dimension + (gy - 1)) / gy)));
+    Point<2> CEnd = Point<2>(TACO_MIN(B2_dimension, CDomain.hi()[0]), TACO_MIN((jo * ((C2_dimension + (gy - 1)) / gy) + ((C2_dimension + (gy - 1)) / gy - 1)), CDomain.hi()[1]));
+    Rect<2> CRect = Rect<2>(CStart, CEnd);
+    if (!CDomain.contains(CRect.lo) || !CDomain.contains(CRect.hi)) {
+      CRect = CRect.make_empty();
+    }
+    CColoring[(*itr)] = CRect;
+  }
+  auto A_dense_run_0_Partition = runtime->create_index_partition(ctx, A_dense_run_0, domain, AColoring, LEGION_DISJOINT_COMPLETE_KIND);
+  auto A_vals_partition = copyPartition(ctx, runtime, A_dense_run_0_Partition, get_logical_region(A_vals));
+  auto B_dense_run_0_Partition = runtime->create_index_partition(ctx, B_dense_run_0, domain, BColoring, LEGION_ALIASED_COMPLETE_KIND);
+  LogicalPartition posPartB2 = copyPartition(ctx, runtime, B_dense_run_0_Partition, B2_pos);
+  LogicalPartition crdPartB2 = runtime->get_logical_partition(ctx, B2_crd, RectCompressedPosPartitionDownwards::apply(ctx, runtime, B2_crd.get_index_space(), posPartB2, B2_pos_parent, FID_RECT_1));
+  auto B_vals_partition = copyPartition(ctx, runtime, crdPartB2, get_logical_region(B_vals));
+  auto C_dense_run_0_Partition = runtime->create_index_partition(ctx, C_dense_run_0, domain, CColoring, LEGION_ALIASED_COMPLETE_KIND);
+  auto C_vals_partition = copyPartition(ctx, runtime, C_dense_run_0_Partition, get_logical_region(C_vals));
+  auto computePartitions = partitionPackForcomputeLegionConsMem();
+  computePartitions.APartition.indicesPartitions = std::vector<std::vector<LogicalPartition>>(2);
+  computePartitions.APartition.denseLevelRunPartitions = std::vector<IndexPartition>(2);
+  computePartitions.APartition.valsPartition = A_vals_partition;
+  computePartitions.APartition.denseLevelRunPartitions[0] = A_dense_run_0_Partition;
+  computePartitions.BPartition.indicesPartitions = std::vector<std::vector<LogicalPartition>>(2);
+  computePartitions.BPartition.denseLevelRunPartitions = std::vector<IndexPartition>(2);
+  computePartitions.BPartition.indicesPartitions[1].push_back(posPartB2);
+  computePartitions.BPartition.indicesPartitions[1].push_back(crdPartB2);
+  computePartitions.BPartition.valsPartition = B_vals_partition;
+  computePartitions.BPartition.denseLevelRunPartitions[0] = B_dense_run_0_Partition;
+  computePartitions.CPartition.indicesPartitions = std::vector<std::vector<LogicalPartition>>(2);
+  computePartitions.CPartition.denseLevelRunPartitions = std::vector<IndexPartition>(2);
+  computePartitions.CPartition.valsPartition = C_vals_partition;
+  computePartitions.CPartition.denseLevelRunPartitions[0] = C_dense_run_0_Partition;
+
+  return computePartitions;
+}
+
+void task_2(const Task* task, const std::vector<PhysicalRegion>& regions, Context ctx, Runtime* runtime) {
+  PhysicalRegion A_vals = regions[0];
+  LogicalRegion A_vals_parent = regions[0].get_logical_region();
+  PhysicalRegion B2_pos = regions[1];
+  LogicalRegion B2_pos_parent = regions[1].get_logical_region();
+  PhysicalRegion B2_crd = regions[2];
+  LogicalRegion B2_crd_parent = regions[2].get_logical_region();
+  PhysicalRegion B_vals = regions[3];
+  LogicalRegion B_vals_parent = regions[3].get_logical_region();
+  PhysicalRegion C_vals = regions[4];
+  LogicalRegion C_vals_parent = regions[4].get_logical_region();
+
+  int64_t distFused = task->index_point[0];
+  task_2Args* args = (task_2Args*)(task->args);
+  int64_t A2_dimension = args->A2_dimension;
+  int64_t B1_dimension = args->B1_dimension;
+  int64_t C2_dimension = args->C2_dimension;
+  int32_t gx = args->gx;
+  int32_t gy = args->gy;
+
+  auto B_vals_ro_accessor = createAccessor<AccessorROdouble1>(B_vals, FID_VAL);
+  auto C_vals_ro_accessor = createAccessor<AccessorROdouble2>(C_vals, FID_VAL);
+  auto A_vals_rw_accessor = createAccessor<AccessorRWdouble2>(A_vals, FID_VAL);
+  auto B2_pos_accessor = createAccessor<AccessorRORect_1_1>(B2_pos, FID_RECT_1);
+  auto B2_crd_accessor = createAccessor<AccessorROint32_t1>(B2_crd, FID_COORD);
+
+  int64_t io = getIndexPoint(task, 0);
+  int64_t jo = getIndexPoint(task, 1);
+  int64_t pointID1 = io;
+  int64_t pointID2 = pointID1 * gy + jo;
+  for (int64_t ii = 0; ii < ((B1_dimension + (gx - 1)) / gx); ii++) {
+    int64_t i = io * ((B1_dimension + (gx - 1)) / gx) + ii;
+    if (i >= B1_dimension)
+      continue;
+
+    if (i >= (io + 1) * ((B1_dimension + (gx - 1)) / gx))
+      continue;
+
+    int64_t pointID3 = pointID2 * ((B1_dimension + (gx - 1)) / gx) + ii;
+    int64_t iA = i;
+    int64_t iB = i;
+    for (int64_t ji = 0; ji < ((C2_dimension + (gy - 1)) / gy); ji++) {
+      int64_t j = jo * ((C2_dimension + (gy - 1)) / gy) + ji;
+      if (j >= C2_dimension)
+        continue;
+
+      if (j >= (jo + 1) * ((C2_dimension + (gy - 1)) / gy))
+        continue;
+
+      int64_t pointID4 = pointID3 * ((C2_dimension + (gy - 1)) / gy) + ji;
+      int64_t jA = iA * A2_dimension + j;
+      for (int64_t kB = B2_pos_accessor[Point<1>(i)].lo; kB < (B2_pos_accessor[Point<1>(i)].hi + 1); kB++) {
+        int64_t k = B2_crd_accessor[(kB * 1)];
+        int64_t kC = k;
+        int64_t jC = kC * C2_dimension + j;
+        A_vals_rw_accessor[Point<2>(i, j)] = A_vals_rw_accessor[Point<2>(i, j)] + B_vals_ro_accessor[Point<1>(kB)] * C_vals_ro_accessor[Point<2>(k, j)];
+      }
+    }
+  }
+}
+
+void computeLegionConsMem(Legion::Context ctx, Legion::Runtime* runtime, LegionTensor* A, LegionTensor* B, LegionTensor* C, partitionPackForcomputeLegionConsMem* partitionPack, int32_t gx, int32_t gy) {
+  int A2_dimension = A->dims[1];
+  auto A_vals_parent = A->valsParent;
+  int B1_dimension = B->dims[0];
+  RegionWrapper B2_crd = B->indices[1][1];
+  auto B2_pos_parent = B->indicesParents[1][0];
+  auto B2_crd_parent = B->indicesParents[1][1];
+  auto B_vals_parent = B->valsParent;
+  int C2_dimension = C->dims[1];
+  auto C_vals_parent = C->valsParent;
+
+  int64_t B2Size = runtime->get_index_space_domain(ctx, get_index_space(B2_crd)).hi()[0] + 1;
+
+  Point<2> lowerBound = Point<2>(0, 0);
+  Point<2> upperBound = Point<2>((gx - 1), (gy - 1));
+  auto distFusedIndexSpace = runtime->create_index_space(ctx, Rect<2>(lowerBound, upperBound));
+  DomainT<2> domain = runtime->get_index_space_domain(ctx, IndexSpaceT<2>(distFusedIndexSpace));
+  task_2Args taskArgsRaw2;
+  taskArgsRaw2.A2_dimension = A2_dimension;
+  taskArgsRaw2.B1_dimension = B1_dimension;
+  taskArgsRaw2.C2_dimension = C2_dimension;
+  taskArgsRaw2.gx = gx;
+  taskArgsRaw2.gy = gy;
+  TaskArgument taskArgs = TaskArgument(&taskArgsRaw2, sizeof(task_2Args));
+  IndexLauncher launcher = IndexLauncher(taskID(2), domain, taskArgs, ArgumentMap());
+  launcher.add_region_requirement(RegionRequirement(partitionPack->APartition.valsPartition, 0, READ_WRITE, EXCLUSIVE, A_vals_parent).add_field(FID_VAL));
+  launcher.add_region_requirement(RegionRequirement(partitionPack->BPartition.indicesPartitions[1][0], 0, READ_ONLY, EXCLUSIVE, get_logical_region(B2_pos_parent)).add_field(FID_RECT_1));
+  launcher.add_region_requirement(RegionRequirement(partitionPack->BPartition.indicesPartitions[1][1], 0, READ_ONLY, EXCLUSIVE, get_logical_region(B2_crd_parent)).add_field(FID_COORD));
+  launcher.add_region_requirement(RegionRequirement(partitionPack->BPartition.valsPartition, 0, READ_ONLY, EXCLUSIVE, B_vals_parent).add_field(FID_VAL));
+  launcher.add_region_requirement(RegionRequirement(partitionPack->CPartition.valsPartition, 0, READ_ONLY, EXCLUSIVE, C_vals_parent).add_field(FID_VAL));
+  launcher.tag |= TACOMapper::BACKPRESSURE_TASK;
+  runtime->execute_index_space(ctx, launcher);
+
 }
 void registerTacoTasks() {
   {
     TaskVariantRegistrar registrar(taskID(1), "task_1");
-    registrar.add_constraint(ProcessorConstraint(Processor::OMP_PROC));
+    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
     registrar.set_leaf();
     Runtime::preregister_task_variant<task_1>(registrar, "task_1");
   }
+  {
+    TaskVariantRegistrar registrar(taskID(2), "task_2");
+    registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
+    registrar.set_leaf();
+    Runtime::preregister_task_variant<task_2>(registrar, "task_2");
+  }
 }