From f6c8441a15f402e35a45109ec91726066d35d009 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Sat, 16 Nov 2024 21:04:04 -0500 Subject: [PATCH] PaRSEC now allows DSLs to free the gpu task We can allocate the GPU task inside the task structure and avoid an extra allocation. Signed-off-by: Joseph Schuchart --- .../ExternalDependenciesVersions.cmake | 2 +- ttg/ttg/parsec/task.h | 12 +- ttg/ttg/parsec/ttg.h | 128 ++++++++---------- 3 files changed, 66 insertions(+), 76 deletions(-) diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake index 49f66fe0a..fbc9c9955 100644 --- a/cmake/modules/ExternalDependenciesVersions.cmake +++ b/cmake/modules/ExternalDependenciesVersions.cmake @@ -4,7 +4,7 @@ set(TTG_TRACKED_VG_CMAKE_KIT_TAG d1b34157c349cf0a7c2f149b7704a682d53f6486) # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost set(TTG_TRACKED_CATCH2_VERSION 3.5.0) set(TTG_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058) -set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c) +set(TTG_TRACKED_PARSEC_TAG a9ab33d8287578c68c0349662352f280bc83e2c0) set(TTG_TRACKED_BTAS_TAG c25b0a11d2a76190bfb13fa72f9e9dc3e57c3c2f) set(TTG_TRACKED_TILEDARRAY_TAG 5944bdba3266a3fa19f1809c8e2accf3dad4d815) diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h index f29ca8ecb..2b3b06d60 100644 --- a/ttg/ttg/parsec/task.h +++ b/ttg/ttg/parsec/task.h @@ -11,20 +11,18 @@ namespace ttg_parsec { namespace detail { struct device_ptr_t { - parsec_gpu_task_t* gpu_task = nullptr; + parsec_gpu_task_t *gpu_task = nullptr; parsec_flow_t* flows = nullptr; parsec_gpu_exec_stream_t* stream = nullptr; parsec_device_gpu_module_t* device = nullptr; parsec_task_class_t task_class; // copy of the taskclass }; - template + template struct device_state_t { static constexpr bool support_device = false; static constexpr size_t num_flows = 0; - device_state_t() - { } static constexpr device_ptr_t* dev_ptr() { return nullptr; } @@ 
-32,10 +30,12 @@ namespace ttg_parsec { template<> struct device_state_t { - static constexpr bool support_device = false; + static constexpr bool support_device = true; static constexpr size_t num_flows = MAX_PARAM_COUNT; parsec_flow_t m_flows[num_flows]; - device_ptr_t m_dev_ptr = {nullptr, &m_flows[0], nullptr, nullptr}; // gpu_task will be allocated in each task + parsec_gpu_task_t device_task; + device_ptr_t m_dev_ptr = {&device_task, &m_flows[0], nullptr, nullptr}; + device_ptr_t* dev_ptr() { return &m_dev_ptr; } diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 47ab79068..ed87ce228 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1494,87 +1494,77 @@ namespace ttg_parsec { return rc; } + /* callback to set in the device task structure */ + static void release_device_task(void *ptr) { + /* nothing to do */ + } + template static parsec_hook_return_t device_static_evaluate(parsec_task_t* parsec_task) { task_t *task = (task_t*)parsec_task; - if (task->dev_ptr->gpu_task == nullptr) { - - /* set up a device task */ - parsec_gpu_task_t *gpu_task; - /* PaRSEC wants to free the gpu_task, because F***K ownerships */ - gpu_task = static_cast(std::calloc(1, sizeof(*gpu_task))); - PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t); - gpu_task->ec = parsec_task; - gpu_task->task_type = 0; // user task - gpu_task->last_data_check_epoch = 0; // used internally - gpu_task->pushout = 0; - gpu_task->submit = &TT::device_static_submit; - - // one way to force the task device - // currently this will probably break all of PaRSEC if this hint - // does not match where the data is located, not really useful for us - // instead we set a hint on the data if there is no hint set yet - //parsec_task->selected_device = ...; - - /* set the gpu_task so it's available in register_device_memory */ - task->dev_ptr->gpu_task = gpu_task; - - /* TODO: is this the right place to set the mask? 
*/ - task->parsec_task.chore_mask = PARSEC_DEV_ALL; - /* copy over the task class, because that's what we need */ - task->dev_ptr->task_class = *task->parsec_task.task_class; - - // first invocation of the coroutine to get the coroutine handle - static_op(parsec_task); - - /* when we come back here, the flows in gpu_task are set (see register_device_memory) */ - - parsec_task_class_t& tc = task->dev_ptr->task_class; - - // input flows are set up during register_device_memory as part of the first invocation above - for (int i = 0; i < MAX_PARAM_COUNT; ++i) { - tc.in[i] = gpu_task->flow[i]; - tc.out[i] = gpu_task->flow[i]; + /* set up the device task */ + parsec_gpu_task_t *gpu_task = task->dev_ptr->gpu_task; + /* zero the whole embedded task struct, as the former calloc(1, sizeof(*gpu_task)) did */ + std::memset(gpu_task, 0, sizeof(*gpu_task)); + /* construct the GPU task */ + PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t); + gpu_task->ec = parsec_task; + gpu_task->task_type = 0; // user task + gpu_task->last_data_check_epoch = 0; // used internally + gpu_task->pushout = 0; + gpu_task->submit = &TT::device_static_submit; + gpu_task->release_device_task = &release_device_task; + + /* TODO: is this the right place to set the mask? 
*/ + task->parsec_task.chore_mask = PARSEC_DEV_ALL; + + /* copy over the task class, because that's what we need */ + task->dev_ptr->task_class = *task->parsec_task.task_class; + + // first invocation of the coroutine to get the coroutine handle + static_op(parsec_task); + + /* when we come back here, the flows in gpu_task are set (see register_device_memory) */ + + parsec_task_class_t& tc = task->dev_ptr->task_class; + + // input flows are set up during register_device_memory as part of the first invocation above + for (int i = 0; i < MAX_PARAM_COUNT; ++i) { + tc.in[i] = gpu_task->flow[i]; + tc.out[i] = gpu_task->flow[i]; + } + tc.nb_flows = MAX_PARAM_COUNT; + + /* set the device hint on the data */ + TT *tt = task->tt; + if (tt->devicemap) { + int parsec_dev; + if constexpr (std::is_void_v) { + parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap()); + } else { + parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap(task->key)); } - tc.nb_flows = MAX_PARAM_COUNT; - - /* set the device hint on the data */ - TT *tt = task->tt; - if (tt->devicemap) { - int parsec_dev; - if constexpr (std::is_void_v) { - parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap()); - } else { - parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap(task->key)); - } - for (int i = 0; i < MAX_PARAM_COUNT; ++i) { - /* only set on mutable data since we have exclusive access */ - if (tc.in[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) { - parsec_data_t *data = parsec_task->data[i].data_in->original; - /* only set the preferred device if the host has the latest copy - * as otherwise we may end up with the wrong data if there is a newer - * version on a different device. Also, keep fingers crossed. 
*/ - if (data->owner_device == 0) { - parsec_advise_data_on_device(data, parsec_dev, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE); - } + for (int i = 0; i < MAX_PARAM_COUNT; ++i) { + /* only set on mutable data since we have exclusive access */ + if (tc.in[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) { + parsec_data_t *data = parsec_task->data[i].data_in->original; + /* only set the preferred device if the host has the latest copy + * as otherwise we may end up with the wrong data if there is a newer + * version on a different device. Also, keep fingers crossed. */ + if (data->owner_device == 0) { + parsec_advise_data_on_device(data, parsec_dev, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE); } } } - - /* set the new task class that contains the flows */ - task->parsec_task.task_class = &task->dev_ptr->task_class; - - /* select this one */ - return PARSEC_HOOK_RETURN_DONE; } - std::cerr << "EVALUATE called on task with assigned GPU task!" << std::endl; - - /* not sure if this might happen*/ - return PARSEC_HOOK_RETURN_ERROR; + /* set the new task class that contains the flows */ + task->parsec_task.task_class = &task->dev_ptr->task_class; + /* select this one */ + return PARSEC_HOOK_RETURN_DONE; } template