Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PaRSEC now allows DSLs to free the gpu task #307

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/modules/ExternalDependenciesVersions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
set(TTG_TRACKED_VG_CMAKE_KIT_TAG 878654d0cb1904049fbd2c37b37d5385ae897658) # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost
set(TTG_TRACKED_CATCH2_VERSION 3.5.0)
set(TTG_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058)
set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c)
set(TTG_TRACKED_PARSEC_TAG c97e2fc54698d3d937d7847a12c7e9084b22a6c8)
set(TTG_TRACKED_BTAS_TAG c25b0a11d2a76190bfb13fa72f9e9dc3e57c3c2f)
set(TTG_TRACKED_TILEDARRAY_TAG 5944bdba3266a3fa19f1809c8e2accf3dad4d815)

Expand Down
2 changes: 1 addition & 1 deletion cmake/modules/FindOrFetchPARSEC.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ if (NOT TARGET PaRSEC::parsec)

FetchContent_Declare(
PARSEC
GIT_REPOSITORY https://github.com/ICLDisco/parsec.git
GIT_REPOSITORY https://github.com/bosilca/parsec.git
GIT_TAG ${TTG_TRACKED_PARSEC_TAG}
)
FetchContent_MakeAvailable(PARSEC)
Expand Down
53 changes: 20 additions & 33 deletions ttg/ttg/parsec/devicefunc.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ namespace ttg_parsec {
parsec_ttg_task_base_t *caller = detail::parsec_ttg_caller;
assert(nullptr != caller->dev_ptr);
parsec_gpu_task_t *gpu_task = caller->dev_ptr->gpu_task;
parsec_flow_t *flows = caller->dev_ptr->flows;

auto& view = std::get<I>(views);
bool is_current = false;
Expand All @@ -38,15 +37,15 @@ namespace ttg_parsec {
}

/* build the flow */
/* TODO: reuse the flows of the task class? How can we control the sync direction then? */
flows[I] = parsec_flow_t{.name = nullptr,
*((parsec_flow_t*)gpu_task->flow_info[I].flow) =
parsec_flow_t{.name = nullptr,
.sym_type = PARSEC_SYM_INOUT,
.flow_flags = static_cast<uint8_t>(access),
.flow_index = I,
.flow_datatype_mask = ~0 };

gpu_task->flow_nb_elts[I] = data->nb_elts; // size in bytes
gpu_task->flow[I] = &flows[I];
gpu_task->flow_info[I].flow_span = data->span; // size in bytes
gpu_task->flow_info[I].flow_dc = nullptr;

/* set the input data copy, parsec will take care of the transfer
* and the buffer will look at the parsec_data_t for the current pointer */
Expand All @@ -57,13 +56,13 @@ namespace ttg_parsec {

} else {
/* ignore the flow */
flows[I] = parsec_flow_t{.name = nullptr,
*((parsec_flow_t*)gpu_task->flow_info[I].flow) =
parsec_flow_t{.name = nullptr,
.sym_type = PARSEC_FLOW_ACCESS_NONE,
.flow_flags = 0,
.flow_index = I,
.flow_datatype_mask = ~0 };
gpu_task->flow[I] = &flows[I];
gpu_task->flow_nb_elts[I] = 0; // size in bytes
gpu_task->flow_info[I].flow_span = 0; // size in bytes
caller->parsec_task.data[I].data_in = nullptr;
}

Expand All @@ -80,6 +79,7 @@ namespace ttg_parsec {
template<typename... Views>
bool register_device_memory(std::tuple<Views&...> &views) {
bool is_current = true;
constexpr const std::size_t num_views = sizeof...(Views);
if (nullptr == detail::parsec_ttg_caller) {
throw std::runtime_error("register_device_memory may only be invoked from inside a task!");
}
Expand All @@ -88,19 +88,13 @@ namespace ttg_parsec {
throw std::runtime_error("register_device_memory called inside a non-gpu task!");
}

auto task = detail::parsec_ttg_caller;
task->dev_ptr->gpu_task->allocate_flows(num_views);

if constexpr (sizeof...(Views) > 0) {
is_current = detail::register_device_memory(views, std::index_sequence_for<Views...>{});
}

/* reset all entries in the current task */
for (int i = sizeof...(Views); i < MAX_PARAM_COUNT; ++i) {
detail::parsec_ttg_caller->parsec_task.data[i].data_in = nullptr;
detail::parsec_ttg_caller->dev_ptr->flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE;
detail::parsec_ttg_caller->dev_ptr->flows[i].flow_index = i;
detail::parsec_ttg_caller->dev_ptr->gpu_task->flow[i] = &detail::parsec_ttg_caller->dev_ptr->flows[i];
detail::parsec_ttg_caller->dev_ptr->gpu_task->flow_nb_elts[i] = 0;
}

return is_current;
}

Expand All @@ -120,8 +114,8 @@ namespace ttg_parsec {
uint8_t i; // only limited number of flows
detail::parsec_ttg_task_base_t *caller = detail::parsec_ttg_caller;
assert(nullptr != caller->dev_ptr);
caller->dev_ptr->gpu_task->allocate_flows(span.size());
parsec_gpu_task_t *gpu_task = caller->dev_ptr->gpu_task;
parsec_flow_t *flows = caller->dev_ptr->flows;

bool is_current = false;
for (i = 0; i < span.size(); ++i) {
Expand All @@ -146,14 +140,15 @@ namespace ttg_parsec {

/* build the flow */
/* TODO: reuse the flows of the task class? How can we control the sync direction then? */
flows[i] = parsec_flow_t{.name = nullptr,
*((parsec_flow_t*)gpu_task->flow_info[i].flow) =
parsec_flow_t{.name = nullptr,
.sym_type = PARSEC_SYM_INOUT,
.flow_flags = static_cast<uint8_t>(access),
.flow_index = i,
.flow_datatype_mask = ~0 };

gpu_task->flow_nb_elts[i] = data->nb_elts; // size in bytes
gpu_task->flow[i] = &flows[i];
gpu_task->flow_info[i].flow_span = data->span; // size in bytes
gpu_task->flow_info[i].flow_dc = nullptr;

/* set the input data copy, parsec will take care of the transfer
* and the buffer will look at the parsec_data_t for the current pointer */
Expand All @@ -164,25 +159,17 @@ namespace ttg_parsec {

} else {
/* ignore the flow */
flows[i] = parsec_flow_t{.name = nullptr,
*((parsec_flow_t*)gpu_task->flow_info[i].flow) =
parsec_flow_t{.name = nullptr,
.sym_type = PARSEC_FLOW_ACCESS_NONE,
.flow_flags = 0,
.flow_index = i,
.flow_datatype_mask = ~0 };
gpu_task->flow[i] = &flows[i];
gpu_task->flow_nb_elts[i] = 0; // size in bytes
gpu_task->flow_info[i].flow_span = 0; // size in bytes
caller->parsec_task.data[i].data_in = nullptr;
}
}

/* reset all remaining entries in the current task */
for (; i < MAX_PARAM_COUNT; ++i) {
detail::parsec_ttg_caller->parsec_task.data[i].data_in = nullptr;
detail::parsec_ttg_caller->dev_ptr->flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE;
detail::parsec_ttg_caller->dev_ptr->flows[i].flow_index = i;
detail::parsec_ttg_caller->dev_ptr->gpu_task->flow[i] = &detail::parsec_ttg_caller->dev_ptr->flows[i];
detail::parsec_ttg_caller->dev_ptr->gpu_task->flow_nb_elts[i] = 0;
}
// we cannot allow the calling thread to submit kernels so say we're not ready
return is_current;
}
Expand All @@ -204,7 +191,7 @@ namespace ttg_parsec {
int ret = device_module->memcpy_async(device_module, stream,
data->device_copies[0]->device_private,
data->device_copies[data->owner_device]->device_private,
data->nb_elts, parsec_device_gpu_transfer_direction_d2h);
data->span, parsec_device_gpu_transfer_direction_d2h);
assert(ret == PARSEC_SUCCESS);
}
if constexpr (sizeof...(Is) > 0) {
Expand Down
17 changes: 1 addition & 16 deletions ttg/ttg/parsec/devicescratch.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,19 +50,6 @@ struct devicescratch {
return data;
}

void remove_from_flow() {
/* remove the scratch from the gpu-task flow */
assert(nullptr != detail::parsec_ttg_caller);
parsec_task_t *parsec_task = &detail::parsec_ttg_caller->parsec_task;
parsec_flow_t *flows = detail::parsec_ttg_caller->dev_ptr->flows;
for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
if (nullptr != parsec_task->data[i].data_in && parsec_task->data[i].data_in->original == m_data) {
flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE; // disable this flow
break;
}
}
}

friend parsec_data_t* detail::get_parsec_data<T>(const ttg_parsec::devicescratch<T>&);

public:
Expand Down Expand Up @@ -93,8 +80,6 @@ struct devicescratch {
devicescratch& operator=(const devicescratch& db) = delete;

~devicescratch() {
/* remove data from flow */
//remove_from_flow();
if (nullptr != m_data) {
//parsec_data_destroy(m_data);
//parsec_data_copy_detach(m_data, parsec_data_get_copy(m_data, 0), 0);
Expand Down Expand Up @@ -128,7 +113,7 @@ struct devicescratch {
}

/// Number of elements of type element_type that fit in the scratch buffer.
/// Derived from the byte span recorded on the backing parsec_data_t
/// (the PR migrated this field from nb_elts to span).
std::size_t size() const {
  return (m_data->span / sizeof(element_type));
}

};
Expand Down
42 changes: 31 additions & 11 deletions ttg/ttg/parsec/task.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,32 +10,52 @@ namespace ttg_parsec {

namespace detail {

struct parsec_ttg_gpu_task_t : public parsec_gpu_task_t {
std::byte *memory = nullptr;
void allocate_flows(std::size_t size) {
if (this->memory != nullptr) free_flows();
constexpr const auto align = std::align_val_t(std::max(alignof(parsec_flow_t), alignof(parsec_gpu_flow_info_t)));
this->memory = new(align) std::byte[size * (sizeof(parsec_flow_t) + sizeof(parsec_gpu_flow_info_s))];
parsec_flow_t *flows = (parsec_flow_t*)this->memory;
this->flow_info = (parsec_gpu_flow_info_t*)(this->memory + size * sizeof(parsec_flow_t));
for (std::size_t i = 0; i < size; ++i) {
this->flow_info[i].flow = &flows[i];
flows[i].flow_index = i;
flows[i].flow_flags = 0;
flows[i].flow_datatype_mask = ~0;
}
this->nb_flows = size;
}

void free_flows() {
if (this->memory != nullptr) {
delete[] this->memory;
this->memory = nullptr;
}
}
};

// Per-task device bookkeeping handed between the TTG task and PaRSEC.
struct device_ptr_t {
  parsec_ttg_gpu_task_t *gpu_task = nullptr;    // TTG wrapper owning the flow storage
  parsec_gpu_exec_stream_t* stream = nullptr;   // execution stream the task is queued on
  parsec_device_gpu_module_t* device = nullptr; // device module the task runs on
  parsec_task_class_t task_class; // copy of the task class
};

template<bool SupportDevice>
template<bool HasDeviceOp>
struct device_state_t
{
static constexpr bool support_device = false;
static constexpr size_t num_flows = 0;
device_state_t()
{ }
static constexpr device_ptr_t* dev_ptr() {
return nullptr;
}
};

template<>
struct device_state_t<true> {
static constexpr bool support_device = false;
static constexpr size_t num_flows = MAX_PARAM_COUNT;
parsec_flow_t m_flows[num_flows];
device_ptr_t m_dev_ptr = {nullptr, &m_flows[0], nullptr, nullptr}; // gpu_task will be allocated in each task
static constexpr bool support_device = true;
parsec_ttg_gpu_task_t device_task;
device_ptr_t m_dev_ptr = {&device_task, nullptr, nullptr};

device_ptr_t* dev_ptr() {
return &m_dev_ptr;
}
Expand Down
Loading
Loading