From f6c8441a15f402e35a45109ec91726066d35d009 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Sat, 16 Nov 2024 21:04:04 -0500
Subject: [PATCH] PaRSEC now allows DSLs to free the gpu task

We can allocate the GPU task inside the task structure and avoid
an extra allocation.

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 .../ExternalDependenciesVersions.cmake        |   2 +-
 ttg/ttg/parsec/task.h                         |  12 +-
 ttg/ttg/parsec/ttg.h                          | 128 ++++++++----------
 3 files changed, 66 insertions(+), 76 deletions(-)
diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake
index 49f66fe0a..fbc9c9955 100644
--- a/cmake/modules/ExternalDependenciesVersions.cmake
+++ b/cmake/modules/ExternalDependenciesVersions.cmake
@@ -4,7 +4,7 @@
 set(TTG_TRACKED_VG_CMAKE_KIT_TAG d1b34157c349cf0a7c2f149b7704a682d53f6486)  # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost
 set(TTG_TRACKED_CATCH2_VERSION 3.5.0)
 set(TTG_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058)
-set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c)
+set(TTG_TRACKED_PARSEC_TAG a9ab33d8287578c68c0349662352f280bc83e2c0)
 set(TTG_TRACKED_BTAS_TAG c25b0a11d2a76190bfb13fa72f9e9dc3e57c3c2f)
 set(TTG_TRACKED_TILEDARRAY_TAG 5944bdba3266a3fa19f1809c8e2accf3dad4d815)
 
diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h
index f29ca8ecb..2b3b06d60 100644
--- a/ttg/ttg/parsec/task.h
+++ b/ttg/ttg/parsec/task.h
@@ -11,20 +11,18 @@ namespace ttg_parsec {
   namespace detail {
 
     struct device_ptr_t {
-      parsec_gpu_task_t* gpu_task = nullptr;
+      parsec_gpu_task_t *gpu_task = nullptr;
       parsec_flow_t* flows = nullptr;
       parsec_gpu_exec_stream_t* stream = nullptr;
       parsec_device_gpu_module_t* device = nullptr;
       parsec_task_class_t task_class; // copy of the taskclass
     };
 
-    template<bool SupportDevice>
+    template<bool HasDeviceOp>
     struct device_state_t
     {
       static constexpr bool support_device = false;
       static constexpr size_t num_flows = 0;
-      device_state_t()
-      { }
       static constexpr device_ptr_t* dev_ptr() {
         return nullptr;
       }
@@ -32,10 +30,12 @@ namespace ttg_parsec {
 
     template<>
     struct device_state_t<true> {
-      static constexpr bool support_device = false;
+      static constexpr bool support_device = true;
       static constexpr size_t num_flows = MAX_PARAM_COUNT;
       parsec_flow_t m_flows[num_flows];
-      device_ptr_t m_dev_ptr = {nullptr, &m_flows[0], nullptr, nullptr}; // gpu_task will be allocated in each task
+      parsec_gpu_task_t device_task;
+      device_ptr_t m_dev_ptr = {&device_task, &m_flows[0], nullptr, nullptr};
+
       device_ptr_t* dev_ptr() {
         return &m_dev_ptr;
       }
diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index 47ab79068..ed87ce228 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -1494,87 +1494,77 @@ namespace ttg_parsec {
       return rc;
     }
 
+    /* release callback that device_static_evaluate installs in gpu_task->release_device_task */
+    static void release_device_task(void *ptr) {
+      /* nothing to do: the GPU task is the device_task member of device_state_t<true>, not a separate allocation */
+    }
+
     template <ttg::ExecutionSpace Space>
     static parsec_hook_return_t device_static_evaluate(parsec_task_t* parsec_task) {
 
       task_t *task = (task_t*)parsec_task;
-      if (task->dev_ptr->gpu_task == nullptr) {
-
-        /* set up a device task */
-        parsec_gpu_task_t *gpu_task;
-        /* PaRSEC wants to free the gpu_task, because F***K ownerships */
-        gpu_task = static_cast<parsec_gpu_task_t*>(std::calloc(1, sizeof(*gpu_task)));
-        PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);
-        gpu_task->ec = parsec_task;
-        gpu_task->task_type = 0; // user task
-        gpu_task->last_data_check_epoch = 0; // used internally
-        gpu_task->pushout = 0;
-        gpu_task->submit = &TT::device_static_submit<Space>;
-
-        // one way to force the task device
-        // currently this will probably break all of PaRSEC if this hint
-        // does not match where the data is located, not really useful for us
-        // instead we set a hint on the data if there is no hint set yet
-        //parsec_task->selected_device = ...;
-
-        /* set the gpu_task so it's available in register_device_memory */
-        task->dev_ptr->gpu_task = gpu_task;
-
-        /* TODO: is this the right place to set the mask? */
-        task->parsec_task.chore_mask = PARSEC_DEV_ALL;
 
-        /* copy over the task class, because that's what we need */
-        task->dev_ptr->task_class = *task->parsec_task.task_class;
-
-        // first invocation of the coroutine to get the coroutine handle
-        static_op<Space>(parsec_task);
-
-        /* when we come back here, the flows in gpu_task are set (see register_device_memory) */
-
-        parsec_task_class_t& tc = task->dev_ptr->task_class;
-
-        // input flows are set up during register_device_memory as part of the first invocation above
-        for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
-          tc.in[i]  = gpu_task->flow[i];
-          tc.out[i] = gpu_task->flow[i];
+      /* set up the device task */
+      parsec_gpu_task_t *gpu_task = task->dev_ptr->gpu_task;
+      /* zero the whole embedded struct (sizeof(*gpu_task), not the pointer size), as calloc did; TODO: needed? */
+      std::memset(gpu_task, 0, sizeof(*gpu_task));
+      /* construct the GPU task */
+      PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);
+      gpu_task->ec = parsec_task;
+      gpu_task->task_type = 0; // user task
+      gpu_task->last_data_check_epoch = 0; // used internally
+      gpu_task->pushout = 0;
+      gpu_task->submit = &TT::device_static_submit<Space>;
+      gpu_task->release_device_task = &release_device_task;
+
+      /* TODO: is this the right place to set the mask? */
+      task->parsec_task.chore_mask = PARSEC_DEV_ALL;
+
+      /* copy over the task class, because that's what we need */
+      task->dev_ptr->task_class = *task->parsec_task.task_class;
+
+      // first invocation of the coroutine to get the coroutine handle
+      static_op<Space>(parsec_task);
+
+      /* when we come back here, the flows in gpu_task are set (see register_device_memory) */
+
+      parsec_task_class_t& tc = task->dev_ptr->task_class;
+
+      // input flows are set up during register_device_memory as part of the first invocation above
+      for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
+        tc.in[i]  = gpu_task->flow[i];
+        tc.out[i] = gpu_task->flow[i];
+      }
+      tc.nb_flows = MAX_PARAM_COUNT;
+
+      /* set the device hint on the data */
+      TT *tt = task->tt;
+      if (tt->devicemap) {
+        int parsec_dev;
+        if constexpr (std::is_void_v<keyT>) {
+          parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap());
+        } else {
+          parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap(task->key));
         }
-        tc.nb_flows = MAX_PARAM_COUNT;
-
-        /* set the device hint on the data */
-        TT *tt = task->tt;
-        if (tt->devicemap) {
-          int parsec_dev;
-          if constexpr (std::is_void_v<keyT>) {
-            parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap());
-          } else {
-            parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap(task->key));
-          }
-          for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
-            /* only set on mutable data since we have exclusive access */
-            if (tc.in[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) {
-              parsec_data_t *data = parsec_task->data[i].data_in->original;
-              /* only set the preferred device if the host has the latest copy
-               * as otherwise we may end up with the wrong data if there is a newer
-               * version on a different device. Also, keep fingers crossed. */
-              if (data->owner_device == 0) {
-                parsec_advise_data_on_device(data, parsec_dev, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE);
-              }
+        for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
+          /* only set on mutable data since we have exclusive access */
+          if (tc.in[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) {
+            parsec_data_t *data = parsec_task->data[i].data_in->original;
+            /* only set the preferred device if the host has the latest copy
+              * as otherwise we may end up with the wrong data if there is a newer
+              * version on a different device. Also, keep fingers crossed. */
+            if (data->owner_device == 0) {
+              parsec_advise_data_on_device(data, parsec_dev, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE);
             }
           }
         }
-
-        /* set the new task class that contains the flows */
-        task->parsec_task.task_class = &task->dev_ptr->task_class;
-
-        /* select this one */
-        return PARSEC_HOOK_RETURN_DONE;
       }
 
-      std::cerr << "EVALUATE called on task with assigned GPU task!" << std::endl;
-
-      /* not sure if this might happen*/
-      return PARSEC_HOOK_RETURN_ERROR;
+      /* set the new task class that contains the flows */
+      task->parsec_task.task_class = &task->dev_ptr->task_class;
 
+      /* select this one */
+      return PARSEC_HOOK_RETURN_DONE;
     }
 
     template <ttg::ExecutionSpace Space>