Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PaRSEC now allows DSLs to free the gpu task #307

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/modules/ExternalDependenciesVersions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
set(TTG_TRACKED_VG_CMAKE_KIT_TAG 878654d0cb1904049fbd2c37b37d5385ae897658) # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost
set(TTG_TRACKED_CATCH2_VERSION 3.5.0)
set(TTG_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058)
set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c)
set(TTG_TRACKED_PARSEC_TAG c97e2fc54698d3d937d7847a12c7e9084b22a6c8)
set(TTG_TRACKED_BTAS_TAG c25b0a11d2a76190bfb13fa72f9e9dc3e57c3c2f)
set(TTG_TRACKED_TILEDARRAY_TAG 5944bdba3266a3fa19f1809c8e2accf3dad4d815)

Expand Down
2 changes: 1 addition & 1 deletion cmake/modules/FindOrFetchPARSEC.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ if (NOT TARGET PaRSEC::parsec)

FetchContent_Declare(
PARSEC
GIT_REPOSITORY https://github.com/ICLDisco/parsec.git
GIT_REPOSITORY https://github.com/bosilca/parsec.git
GIT_TAG ${TTG_TRACKED_PARSEC_TAG}
)
FetchContent_MakeAvailable(PARSEC)
Expand Down
53 changes: 20 additions & 33 deletions ttg/ttg/parsec/devicefunc.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ namespace ttg_parsec {
parsec_ttg_task_base_t *caller = detail::parsec_ttg_caller;
assert(nullptr != caller->dev_ptr);
parsec_gpu_task_t *gpu_task = caller->dev_ptr->gpu_task;
parsec_flow_t *flows = caller->dev_ptr->flows;

auto& view = std::get<I>(views);
bool is_current = false;
Expand All @@ -38,15 +37,15 @@ namespace ttg_parsec {
}

/* build the flow */
/* TODO: reuse the flows of the task class? How can we control the sync direction then? */
flows[I] = parsec_flow_t{.name = nullptr,
*((parsec_flow_t*)gpu_task->flow_info[I].flow) =
parsec_flow_t{.name = nullptr,
.sym_type = PARSEC_SYM_INOUT,
.flow_flags = static_cast<uint8_t>(access),
.flow_index = I,
.flow_datatype_mask = ~0 };

gpu_task->flow_nb_elts[I] = data->nb_elts; // size in bytes
gpu_task->flow[I] = &flows[I];
gpu_task->flow_info[I].flow_span = data->span; // size in bytes
gpu_task->flow_info[I].flow_dc = nullptr;

/* set the input data copy, parsec will take care of the transfer
* and the buffer will look at the parsec_data_t for the current pointer */
Expand All @@ -57,13 +56,13 @@ namespace ttg_parsec {

} else {
/* ignore the flow */
flows[I] = parsec_flow_t{.name = nullptr,
*((parsec_flow_t*)gpu_task->flow_info[I].flow) =
parsec_flow_t{.name = nullptr,
.sym_type = PARSEC_FLOW_ACCESS_NONE,
.flow_flags = 0,
.flow_index = I,
.flow_datatype_mask = ~0 };
gpu_task->flow[I] = &flows[I];
gpu_task->flow_nb_elts[I] = 0; // size in bytes
gpu_task->flow_info[I].flow_span = 0; // size in bytes
caller->parsec_task.data[I].data_in = nullptr;
}

Expand All @@ -80,6 +79,7 @@ namespace ttg_parsec {
template<typename... Views>
bool register_device_memory(std::tuple<Views&...> &views) {
bool is_current = true;
constexpr const std::size_t num_views = sizeof...(Views);
if (nullptr == detail::parsec_ttg_caller) {
throw std::runtime_error("register_device_memory may only be invoked from inside a task!");
}
Expand All @@ -88,19 +88,13 @@ namespace ttg_parsec {
throw std::runtime_error("register_device_memory called inside a non-gpu task!");
}

auto task = detail::parsec_ttg_caller;
task->dev_ptr->gpu_task->allocate_flows(num_views);

if constexpr (sizeof...(Views) > 0) {
is_current = detail::register_device_memory(views, std::index_sequence_for<Views...>{});
}

/* reset all entries in the current task */
for (int i = sizeof...(Views); i < MAX_PARAM_COUNT; ++i) {
detail::parsec_ttg_caller->parsec_task.data[i].data_in = nullptr;
detail::parsec_ttg_caller->dev_ptr->flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE;
detail::parsec_ttg_caller->dev_ptr->flows[i].flow_index = i;
detail::parsec_ttg_caller->dev_ptr->gpu_task->flow[i] = &detail::parsec_ttg_caller->dev_ptr->flows[i];
detail::parsec_ttg_caller->dev_ptr->gpu_task->flow_nb_elts[i] = 0;
}

return is_current;
}

Expand All @@ -120,8 +114,8 @@ namespace ttg_parsec {
uint8_t i; // only limited number of flows
detail::parsec_ttg_task_base_t *caller = detail::parsec_ttg_caller;
assert(nullptr != caller->dev_ptr);
caller->dev_ptr->gpu_task->allocate_flows(span.size());
parsec_gpu_task_t *gpu_task = caller->dev_ptr->gpu_task;
parsec_flow_t *flows = caller->dev_ptr->flows;

bool is_current = false;
for (i = 0; i < span.size(); ++i) {
Expand All @@ -146,14 +140,15 @@ namespace ttg_parsec {

/* build the flow */
/* TODO: reuse the flows of the task class? How can we control the sync direction then? */
flows[i] = parsec_flow_t{.name = nullptr,
*((parsec_flow_t*)gpu_task->flow_info[i].flow) =
parsec_flow_t{.name = nullptr,
.sym_type = PARSEC_SYM_INOUT,
.flow_flags = static_cast<uint8_t>(access),
.flow_index = i,
.flow_datatype_mask = ~0 };

gpu_task->flow_nb_elts[i] = data->nb_elts; // size in bytes
gpu_task->flow[i] = &flows[i];
gpu_task->flow_info[i].flow_span = data->span; // size in bytes
gpu_task->flow_info[i].flow_dc = nullptr;

/* set the input data copy, parsec will take care of the transfer
* and the buffer will look at the parsec_data_t for the current pointer */
Expand All @@ -164,25 +159,17 @@ namespace ttg_parsec {

} else {
/* ignore the flow */
flows[i] = parsec_flow_t{.name = nullptr,
*((parsec_flow_t*)gpu_task->flow_info[i].flow) =
parsec_flow_t{.name = nullptr,
.sym_type = PARSEC_FLOW_ACCESS_NONE,
.flow_flags = 0,
.flow_index = i,
.flow_datatype_mask = ~0 };
gpu_task->flow[i] = &flows[i];
gpu_task->flow_nb_elts[i] = 0; // size in bytes
gpu_task->flow_info[i].flow_span = 0; // size in bytes
caller->parsec_task.data[i].data_in = nullptr;
}
}

/* reset all remaining entries in the current task */
for (; i < MAX_PARAM_COUNT; ++i) {
detail::parsec_ttg_caller->parsec_task.data[i].data_in = nullptr;
detail::parsec_ttg_caller->dev_ptr->flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE;
detail::parsec_ttg_caller->dev_ptr->flows[i].flow_index = i;
detail::parsec_ttg_caller->dev_ptr->gpu_task->flow[i] = &detail::parsec_ttg_caller->dev_ptr->flows[i];
detail::parsec_ttg_caller->dev_ptr->gpu_task->flow_nb_elts[i] = 0;
}
// we cannot allow the calling thread to submit kernels so say we're not ready
return is_current;
}
Expand All @@ -204,7 +191,7 @@ namespace ttg_parsec {
int ret = device_module->memcpy_async(device_module, stream,
data->device_copies[0]->device_private,
data->device_copies[data->owner_device]->device_private,
data->nb_elts, parsec_device_gpu_transfer_direction_d2h);
data->span, parsec_device_gpu_transfer_direction_d2h);
assert(ret == PARSEC_SUCCESS);
}
if constexpr (sizeof...(Is) > 0) {
Expand Down
17 changes: 1 addition & 16 deletions ttg/ttg/parsec/devicescratch.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,19 +50,6 @@ struct devicescratch {
return data;
}

void remove_from_flow() {
/* remove the scratch from the gpu-task flow */
assert(nullptr != detail::parsec_ttg_caller);
parsec_task_t *parsec_task = &detail::parsec_ttg_caller->parsec_task;
parsec_flow_t *flows = detail::parsec_ttg_caller->dev_ptr->flows;
for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
if (nullptr != parsec_task->data[i].data_in && parsec_task->data[i].data_in->original == m_data) {
flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE; // disable this flow
break;
}
}
}

friend parsec_data_t* detail::get_parsec_data<T>(const ttg_parsec::devicescratch<T>&);

public:
Expand Down Expand Up @@ -93,8 +80,6 @@ struct devicescratch {
devicescratch& operator=(const devicescratch& db) = delete;

~devicescratch() {
/* remove data from flow */
//remove_from_flow();
if (nullptr != m_data) {
//parsec_data_destroy(m_data);
//parsec_data_copy_detach(m_data, parsec_data_get_copy(m_data, 0), 0);
Expand Down Expand Up @@ -128,7 +113,7 @@ struct devicescratch {
}

/// Number of elements of type element_type that fit in the scratch buffer.
/// Derived from the byte span recorded on the backing parsec_data_t
/// (the PR migrated this field from nb_elts to span).
std::size_t size() const {
  return (m_data->span / sizeof(element_type));
}

};
Expand Down
42 changes: 31 additions & 11 deletions ttg/ttg/parsec/task.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,32 +10,52 @@ namespace ttg_parsec {

namespace detail {

struct parsec_ttg_gpu_task_t : public parsec_gpu_task_t {
std::byte *memory = nullptr;
void allocate_flows(std::size_t size) {
if (this->memory != nullptr) free_flows();
constexpr const auto align = std::align_val_t(std::max(alignof(parsec_flow_t), alignof(parsec_gpu_flow_info_t)));
this->memory = new(align) std::byte[size * (sizeof(parsec_flow_t) + sizeof(parsec_gpu_flow_info_s))];
parsec_flow_t *flows = (parsec_flow_t*)this->memory;
this->flow_info = (parsec_gpu_flow_info_t*)(this->memory + size * sizeof(parsec_flow_t));
for (std::size_t i = 0; i < size; ++i) {
this->flow_info[i].flow = &flows[i];
flows[i].flow_index = i;
flows[i].flow_flags = 0;
flows[i].flow_datatype_mask = ~0;
}
this->nb_flows = size;
}

void free_flows() {
if (this->memory != nullptr) {
delete[] this->memory;
this->memory = nullptr;
}
}
};

// Per-task device bookkeeping handed between the TTG task and PaRSEC.
struct device_ptr_t {
  parsec_ttg_gpu_task_t *gpu_task = nullptr;    // TTG wrapper owning the flow storage
  parsec_gpu_exec_stream_t* stream = nullptr;   // execution stream the task is queued on
  parsec_device_gpu_module_t* device = nullptr; // device module the task runs on
  parsec_task_class_t task_class; // copy of the task class
};

template<bool SupportDevice>
template<bool HasDeviceOp>
struct device_state_t
{
static constexpr bool support_device = false;
static constexpr size_t num_flows = 0;
device_state_t()
{ }
static constexpr device_ptr_t* dev_ptr() {
return nullptr;
}
};

template<>
struct device_state_t<true> {
static constexpr bool support_device = false;
static constexpr size_t num_flows = MAX_PARAM_COUNT;
parsec_flow_t m_flows[num_flows];
device_ptr_t m_dev_ptr = {nullptr, &m_flows[0], nullptr, nullptr}; // gpu_task will be allocated in each task
static constexpr bool support_device = true;
parsec_ttg_gpu_task_t device_task;
device_ptr_t m_dev_ptr = {&device_task, nullptr, nullptr};

device_ptr_t* dev_ptr() {
return &m_dev_ptr;
}
Expand Down
Loading
Loading