Skip to content

Commit

Permalink
PaRSEC now allows DSLs to free the gpu task
Browse files Browse the repository at this point in the history
We can allocate the GPU task inside the task structure and avoid
an extra allocation.

Signed-off-by: Joseph Schuchart <[email protected]>
  • Loading branch information
devreal committed Dec 23, 2024
1 parent b180cac commit 2c1323a
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 76 deletions.
2 changes: 1 addition & 1 deletion cmake/modules/ExternalDependenciesVersions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
set(TTG_TRACKED_VG_CMAKE_KIT_TAG d1b34157c349cf0a7c2f149b7704a682d53f6486) # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost
set(TTG_TRACKED_CATCH2_VERSION 3.5.0)
set(TTG_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058)
set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c)
set(TTG_TRACKED_PARSEC_TAG a9ab33d8287578c68c0349662352f280bc83e2c0)
set(TTG_TRACKED_BTAS_TAG c25b0a11d2a76190bfb13fa72f9e9dc3e57c3c2f)
set(TTG_TRACKED_TILEDARRAY_TAG 5944bdba3266a3fa19f1809c8e2accf3dad4d815)

Expand Down
12 changes: 6 additions & 6 deletions ttg/ttg/parsec/task.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,31 +11,31 @@ namespace ttg_parsec {
namespace detail {

/* Per-task bundle of device-related state: pointers to the PaRSEC GPU task,
 * the flow array, the execution stream and the GPU device module, plus a
 * by-value task class. All pointer members default to nullptr. */
struct device_ptr_t {
/* NOTE(review): gpu_task is listed twice below; the two lines differ only in
 * whitespace and look like the before/after sides of a diff whose markers
 * were lost -- confirm that only one declaration exists in the real header. */
parsec_gpu_task_t* gpu_task = nullptr;
parsec_gpu_task_t *gpu_task = nullptr;
parsec_flow_t* flows = nullptr; // device_state_t<true> points this at its m_flows array
parsec_gpu_exec_stream_t* stream = nullptr; // not assigned anywhere in this view
parsec_device_gpu_module_t* device = nullptr; // not assigned anywhere in this view
parsec_task_class_t task_class; // per-task copy of the task class (filled in by device_static_evaluate)
};

/* Primary template of device_state_t: the variant used when the bool
 * parameter is not `true`. It stores nothing: support_device is false,
 * num_flows is 0, and dev_ptr() always yields nullptr. The <true>
 * specialization that follows carries the actual device state.
 *
 * NOTE(review): two template headers appear below with different parameter
 * names (SupportDevice / HasDeviceOp); they look like the before/after sides
 * of a diff whose markers were lost -- confirm which one the header uses. */
template<bool SupportDevice>
template<bool HasDeviceOp>
struct device_state_t
{
static constexpr bool support_device = false;
static constexpr size_t num_flows = 0;
device_state_t()
{ }
/* No device state in this variant, so there is no device_ptr_t to hand out. */
static constexpr device_ptr_t* dev_ptr() {
return nullptr;
}
};

template<>
struct device_state_t<true> {
static constexpr bool support_device = false;
static constexpr bool support_device = true;
static constexpr size_t num_flows = MAX_PARAM_COUNT;
parsec_flow_t m_flows[num_flows];
device_ptr_t m_dev_ptr = {nullptr, &m_flows[0], nullptr, nullptr}; // gpu_task will be allocated in each task
parsec_gpu_task_t device_task;
device_ptr_t m_dev_ptr = {&device_task, &m_flows[0], nullptr, nullptr};

device_ptr_t* dev_ptr() {
return &m_dev_ptr;
}
Expand Down
128 changes: 59 additions & 69 deletions ttg/ttg/parsec/ttg.h
Original file line number Diff line number Diff line change
Expand Up @@ -1494,87 +1494,77 @@ namespace ttg_parsec {
return rc;
}

/* Callback stored in gpu_task->release_device_task by device_static_evaluate.
 * It deliberately does nothing and ignores ptr.
 * NOTE(review): presumably a no-op because the GPU task is a member of the
 * task's device state (device_state_t<true>::device_task) rather than a
 * separate allocation, so there is nothing to free -- confirm against the
 * PaRSEC contract for this callback. */
static void release_device_task(void *ptr) {
/* nothing to do */
}

/* Evaluate hook for device tasks in execution space `Space`.
 *
 * Steps visible in this listing:
 *  - treat parsec_task as the enclosing task_t;
 *  - initialise the GPU task descriptor (ec, task_type, last_data_check_epoch,
 *    pushout, submit and, in one variant, release_device_task);
 *  - set the chore mask to PARSEC_DEV_ALL and copy the task class into
 *    task->dev_ptr->task_class;
 *  - invoke static_op<Space> once, after which gpu_task->flow[] is expected
 *    to be populated (see register_device_memory);
 *  - copy those flows into the in/out arrays of the task-class copy and set
 *    nb_flows to MAX_PARAM_COUNT;
 *  - if the TT has a devicemap, advise PaRSEC of the preferred device for
 *    write-access flows whose data has owner_device == 0;
 *  - point the task at the task-class copy and return PARSEC_HOOK_RETURN_DONE.
 * One variant additionally reports on std::cerr and returns
 * PARSEC_HOOK_RETURN_ERROR when a GPU task is already assigned.
 *
 * NOTE(review): this listing contains both the pre-change and post-change
 * bodies interleaved (diff markers and indentation were lost). gpu_task is
 * declared twice, several statements are duplicated, and the braces do not
 * balance as shown. Reconstruct the intended side before editing any code. */
template <ttg::ExecutionSpace Space>
static parsec_hook_return_t device_static_evaluate(parsec_task_t* parsec_task) {

task_t *task = (task_t*)parsec_task;
if (task->dev_ptr->gpu_task == nullptr) {

/* set up a device task */
parsec_gpu_task_t *gpu_task;
/* the GPU task is allocated separately here because PaRSEC wants to free the gpu_task itself */
gpu_task = static_cast<parsec_gpu_task_t*>(std::calloc(1, sizeof(*gpu_task)));
PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);
gpu_task->ec = parsec_task;
gpu_task->task_type = 0; // user task
gpu_task->last_data_check_epoch = 0; // used internally
gpu_task->pushout = 0;
gpu_task->submit = &TT::device_static_submit<Space>;

// one way to force the task device
// currently this will probably break all of PaRSEC if this hint
// does not match where the data is located, not really useful for us
// instead we set a hint on the data if there is no hint set yet
//parsec_task->selected_device = ...;

/* set the gpu_task so it's available in register_device_memory */
task->dev_ptr->gpu_task = gpu_task;

/* TODO: is this the right place to set the mask? */
task->parsec_task.chore_mask = PARSEC_DEV_ALL;

/* copy over the task class, because that's what we need */
task->dev_ptr->task_class = *task->parsec_task.task_class;

// first invocation of the coroutine to get the coroutine handle
static_op<Space>(parsec_task);

/* when we come back here, the flows in gpu_task are set (see register_device_memory) */

parsec_task_class_t& tc = task->dev_ptr->task_class;

// input flows are set up during register_device_memory as part of the first invocation above
for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
tc.in[i] = gpu_task->flow[i];
tc.out[i] = gpu_task->flow[i];
/* set up the device task */
/* this variant reuses the GPU task already referenced by dev_ptr instead of allocating one */
parsec_gpu_task_t *gpu_task = task->dev_ptr->gpu_task;
/* TODO: needed? */
std::memset(gpu_task, 0, sizeof(*gpu_task));
/* construct the GPU task */
PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);
gpu_task->ec = parsec_task;
gpu_task->task_type = 0; // user task
gpu_task->last_data_check_epoch = 0; // used internally
gpu_task->pushout = 0;
gpu_task->submit = &TT::device_static_submit<Space>;
/* install the no-op release callback defined alongside this function */
gpu_task->release_device_task = &release_device_task;

/* TODO: is this the right place to set the mask? */
task->parsec_task.chore_mask = PARSEC_DEV_ALL;

/* copy over the task class, because that's what we need */
task->dev_ptr->task_class = *task->parsec_task.task_class;

// first invocation of the coroutine to get the coroutine handle
static_op<Space>(parsec_task);

/* when we come back here, the flows in gpu_task are set (see register_device_memory) */

parsec_task_class_t& tc = task->dev_ptr->task_class;

// input flows are set up during register_device_memory as part of the first invocation above
for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
tc.in[i] = gpu_task->flow[i];
tc.out[i] = gpu_task->flow[i];
}
/* every one of the MAX_PARAM_COUNT slots is counted as a flow */
tc.nb_flows = MAX_PARAM_COUNT;

/* set the device hint on the data */
TT *tt = task->tt;
if (tt->devicemap) {
int parsec_dev;
if constexpr (std::is_void_v<keyT>) {
parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap());
} else {
parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap(task->key));
}
tc.nb_flows = MAX_PARAM_COUNT;

/* set the device hint on the data */
TT *tt = task->tt;
if (tt->devicemap) {
int parsec_dev;
if constexpr (std::is_void_v<keyT>) {
parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap());
} else {
parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap(task->key));
}
for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
/* only set on mutable data since we have exclusive access */
if (tc.in[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) {
parsec_data_t *data = parsec_task->data[i].data_in->original;
/* only set the preferred device if the host has the latest copy
* as otherwise we may end up with the wrong data if there is a newer
* version on a different device. Also, keep fingers crossed. */
if (data->owner_device == 0) {
parsec_advise_data_on_device(data, parsec_dev, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE);
}
for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
/* only set on mutable data since we have exclusive access */
if (tc.in[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) {
parsec_data_t *data = parsec_task->data[i].data_in->original;
/* only set the preferred device if the host has the latest copy
* as otherwise we may end up with the wrong data if there is a newer
* version on a different device. Also, keep fingers crossed. */
if (data->owner_device == 0) {
parsec_advise_data_on_device(data, parsec_dev, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE);
}
}
}

/* set the new task class that contains the flows */
task->parsec_task.task_class = &task->dev_ptr->task_class;

/* select this one */
return PARSEC_HOOK_RETURN_DONE;
}

std::cerr << "EVALUATE called on task with assigned GPU task!" << std::endl;

/* not sure if this might happen*/
return PARSEC_HOOK_RETURN_ERROR;
/* set the new task class that contains the flows */
task->parsec_task.task_class = &task->dev_ptr->task_class;

/* select this one */
return PARSEC_HOOK_RETURN_DONE;
}

template <ttg::ExecutionSpace Space>
Expand Down

0 comments on commit 2c1323a

Please sign in to comment.