Skip to content
This repository has been archived by the owner on Dec 24, 2024. It is now read-only.

Commit

Permalink
Adding soft queue dispatch logic to dispatch commands to AIE agents
Browse files Browse the repository at this point in the history
  • Loading branch information
eddierichter-amd authored and makslevental committed Sep 2, 2024
1 parent 4cb0048 commit 3dd381f
Show file tree
Hide file tree
Showing 4 changed files with 313 additions and 3 deletions.
21 changes: 20 additions & 1 deletion runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,9 @@ hsa_status_t XdnaDriver::GetAgentProperties(core::Agent &agent) const {
return HSA_STATUS_ERROR;
}

aie_agent.SetNumCols(aie_metadata.cols);
// Right now we can only target N-1 of the N columns the device reports,
// so advertise one fewer column here as a workaround.
aie_agent.SetNumCols(aie_metadata.cols - 1);
aie_agent.SetNumCoreRows(aie_metadata.core.row_count);

return HSA_STATUS_SUCCESS;
Expand Down Expand Up @@ -351,6 +353,16 @@ hsa_status_t XdnaDriver::InitDeviceHeap() {
return HSA_STATUS_SUCCESS;
}

/// @brief Hands the caller a copy of the driver's handle-to-address table.
///
/// @param vmem_handle_mappings Output map; its previous contents are replaced
/// by a copy of the driver's member of the same name.
/// @return Always HSA_STATUS_SUCCESS.
hsa_status_t XdnaDriver::GetHandleMappings(std::unordered_map<uint32_t, void*> &vmem_handle_mappings) {
  // The parameter shadows the member, so the member must be reached via this->.
  const auto &driver_table = this->vmem_handle_mappings;
  vmem_handle_mappings = driver_table;
  return HSA_STATUS_SUCCESS;
}

/// @brief Reports the file descriptor stored in this driver instance.
///
/// @param fd Output; receives the current value of fd_.
/// @return Always HSA_STATUS_SUCCESS.
hsa_status_t XdnaDriver::GetFd(int &fd) {
  const int driver_fd = fd_;
  fd = driver_fd;
  return HSA_STATUS_SUCCESS;
}

hsa_status_t XdnaDriver::FreeDeviceHeap() {
if (dev_heap_parent) {
munmap(dev_heap_parent, dev_heap_align * 2 - 1);
Expand Down Expand Up @@ -388,6 +400,13 @@ hsa_status_t XdnaDriver::ConfigHwCtxCU(
config_cu_param.cu_configs[i].cu_config_bo;
xdna_config_cu_param->cu_configs[i].cu_func =
config_cu_param.cu_configs[i].cu_func;

// sync configuration buffer
amdxdna_drm_sync_bo sync_args = {};
sync_args.handle = xdna_config_cu_param->cu_configs[i].cu_bo;
if (ioctl(fd_, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_args) < 0) {
return HSA_STATUS_ERROR;
}
}

amdxdna_drm_config_hwctx config_hw_ctx_args{
Expand Down
62 changes: 62 additions & 0 deletions runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,35 @@
#include "core/inc/signal.h"
#include "core/util/locks.h"

/*
 * Interpretation of the beginning of data payload for ERT_CMD_CHAIN in
 * amdxdna_cmd. The rest of the payload in amdxdna_cmd is cmd BO handles.
 */
struct amdxdna_cmd_chain {
// Number of entries in data[] (tied to it by the __counted_by annotation).
__u32 command_count;
// NOTE(review): presumably the index of the command being submitted; the
// submission code in this change only ever writes 0 here - confirm.
__u32 submit_index;
// NOTE(review): presumably reports which command in the chain failed; the
// submission code in this change only ever writes 0 here - confirm.
__u32 error_index;
// Reserved words.
__u32 reserved[3];
// One entry per chained command; the submission code stores a command BO
// handle in each slot.
__u64 data[] __counted_by(command_count);
};


/* Exec buffer command header format */
/*
 * The first 32-bit word is a packed header that can be accessed either through
 * the individual bitfields or as the raw `header` value; `data` is the
 * variable-length payload that follows it.
 */
struct amdxdna_cmd {
union {
struct {
// 4-bit command state (the submission code writes an ERT state value here).
__u32 state : 4;
// 6 unused bits.
__u32 unused : 6;
// 2-bit field; the submission code in this change always writes 0.
__u32 extra_cu_masks : 2;
// 11-bit element count for data[] (see the __counted_by annotation below).
__u32 count : 11;
// 5-bit command opcode.
__u32 opcode : 5;
// 4 reserved bits; the bitfields above and this one total 32 bits.
__u32 reserved : 4;
};
// Raw view of the same 32-bit header word.
__u32 header;
};
// Command payload, in 32-bit words.
__u32 data[] __counted_by(count);
};

namespace rocr {
namespace AMD {

Expand Down Expand Up @@ -134,6 +163,39 @@ class AieAqlQueue : public core::Queue,
/// @brief Base of the queue's ring buffer storage.
void *ring_buf_ = nullptr;

/// @brief Walks the packets between two dispatch ids and submits them to the
/// device.
///
/// @param hw_ctx_handle Handle of the hardware context the commands run in
/// @param fd File descriptor the driver ioctls are issued on
/// @param queue_base Base address of the queue's packet storage
/// @param read_dispatch_id Id of the first packet to process
/// @param write_dispatch_id Id one past the last packet to process
/// @param vmem_handle_mappings Map from BO handle to the address used when
/// rewriting packet operands
hsa_status_t SubmitCmd(uint32_t hw_ctx_handle, int fd, void *queue_base,
                       uint64_t read_dispatch_id, uint64_t write_dispatch_id,
                       std::unordered_map<uint32_t, void*> &vmem_handle_mappings);

/// @brief Creates a command BO and returns a pointer to the memory and
/// the corresponding handle
///
/// @param size Requested size of the command BO, in bytes
/// @param handle Output: receives the handle of the newly created BO
/// @param cmd Output: receives a pointer to the BO's mapping in this process
/// @param fd File descriptor the create/info ioctls and the mmap are issued on
/// @return HSA_STATUS_ERROR if one of the ioctls fails, otherwise
/// HSA_STATUS_SUCCESS
hsa_status_t CreateCmd(uint32_t size, uint32_t *handle, amdxdna_cmd **cmd, int fd);

/// @brief Adds all BOs in a command packet payload to a vector
/// and replaces the handles with a virtual address
///
/// @param count Number of entries in the command
/// @param bo_args Vector that the BO handles found in the payload are
/// appended to
/// @param cmd_pkt_payload A pointer to the payload of the command; it is
/// modified in place
/// @param vmem_handle_mappings Map used to translate each BO handle into the
/// address written back into the payload
void RegisterCmdBOs(uint32_t count, std::vector<uint32_t> &bo_args,
hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload,
std::unordered_map<uint32_t, void*> &vmem_handle_mappings);

/// @brief Syncs all BOs referenced in bo_args
///
/// @param bo_args vector containing handles of BOs to sync
/// @param fd File descriptor the sync ioctl is issued on
/// @return HSA_STATUS_ERROR as soon as one sync ioctl fails, otherwise
/// HSA_STATUS_SUCCESS
hsa_status_t SyncBos(std::vector<uint32_t> &bo_args, int fd);

/// @brief Executes a command and waits for its completion
///
/// @param exec_cmd Structure containing the details of the command to execute
/// @param hw_ctx_handle the handle of the hardware context to run this command
/// @param fd File descriptor the exec and wait ioctls are issued on
/// @return HSA_STATUS_ERROR if either the exec or the wait ioctl fails,
/// otherwise HSA_STATUS_SUCCESS
hsa_status_t ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd, uint32_t hw_ctx_handle, int fd);

/// @brief Handle for an application context on the AIE device.
///
/// Each user queue will have an associated context. This handle is assigned
Expand Down
4 changes: 4 additions & 0 deletions runtime/hsa-runtime/core/inc/amd_xdna_driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@

#include "core/inc/driver.h"
#include "core/inc/memory_region.h"
#include "core/driver/xdna/uapi/amdxdna_accel.h"

namespace rocr {
namespace core {
Expand All @@ -69,6 +70,9 @@ class XdnaDriver : public core::Driver {
hsa_status_t Init() override;
hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override;

/// @brief Copies the driver's handle-to-address table into the caller's map.
///
/// @param vmem_handle_mappings Output map; overwritten with the copy.
hsa_status_t GetHandleMappings(std::unordered_map<uint32_t, void*> &vmem_handle_mappings);
/// @brief Reports the file descriptor held by the driver.
///
/// @param fd Output; receives the driver's file descriptor.
hsa_status_t GetFd(int &fd);

hsa_status_t GetAgentProperties(core::Agent &agent) const override;
hsa_status_t
GetMemoryProperties(uint32_t node_id,
Expand Down
229 changes: 227 additions & 2 deletions runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,16 @@
////////////////////////////////////////////////////////////////////////////////

#include "core/inc/amd_aie_aql_queue.h"
#include "core/inc/amd_xdna_driver.h"

#ifdef __linux__
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#endif

#ifdef _WIN32
Expand Down Expand Up @@ -195,8 +198,230 @@ uint64_t AieAqlQueue::AddWriteIndexAcqRel(uint64_t value) {
}

// Doorbell store for the AIE queue. Besides writing the doorbell value, this
// is where pending packets are handed to SubmitCmd: the handle mappings and
// the file descriptor are fetched from the XDNA driver and passed along with
// the queue's current read/write dispatch ids. If either driver query fails
// the function returns without submitting anything.
void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) {
// Write the new value to the doorbell location. Note that, despite the
// function name, the store uses release ordering.
atomic::Store(signal_.hardware_doorbell_ptr, uint64_t(value),
std::memory_order_release);
// Take a private copy of the driver's handle -> address table.
std::unordered_map<uint32_t, void*> vmem_handle_mappings;
if(static_cast<XdnaDriver&>(core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type)).GetHandleMappings(vmem_handle_mappings) != HSA_STATUS_SUCCESS)
return;

// File descriptor that the submission ioctls will be issued on.
int fd = 0;
if(static_cast<XdnaDriver&>(core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type)).GetFd(fd) != HSA_STATUS_SUCCESS)
return;

// The status returned by SubmitCmd is discarded; this function returns void.
SubmitCmd(hw_ctx_handle_, fd, amd_queue_.hsa_queue.base_address, amd_queue_.read_dispatch_id, amd_queue_.write_dispatch_id, vmem_handle_mappings);
}

/// @brief Issues one DRM_IOCTL_AMDXDNA_SYNC_BO request per handle, in order.
///
/// @param bo_args Handles of the BOs to sync.
/// @param fd File descriptor the ioctl is issued on.
/// @return HSA_STATUS_ERROR as soon as an ioctl returns non-zero (remaining
/// handles are not visited), otherwise HSA_STATUS_SUCCESS.
hsa_status_t AieAqlQueue::SyncBos(std::vector<uint32_t> &bo_args, int fd) {
  for (const uint32_t bo_handle : bo_args) {
    amdxdna_drm_sync_bo request = {};
    request.handle = bo_handle;

    const int rc = ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &request);
    if (rc != 0) {
      return HSA_STATUS_ERROR;
    }
  }

  return HSA_STATUS_SUCCESS;
}

/// @brief Submits a command to the device and blocks until the driver reports
/// that it finished (or the wait fails).
///
/// @param exec_cmd Exec request handed to DRM_IOCTL_AMDXDNA_EXEC_CMD; its
/// `seq` field is read back afterwards to identify the command to wait on.
/// @param hw_ctx_handle Hardware context the wait is issued against.
/// @param fd File descriptor both ioctls are issued on.
/// @return HSA_STATUS_ERROR if either ioctl returns non-zero, otherwise
/// HSA_STATUS_SUCCESS.
hsa_status_t AieAqlQueue::ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd, uint32_t hw_ctx_handle, int fd) {
  // Step 1: hand the command to the driver.
  const bool submitted = (ioctl(fd, DRM_IOCTL_AMDXDNA_EXEC_CMD, exec_cmd) == 0);
  if (!submitted) {
    return HSA_STATUS_ERROR;
  }

  // Step 2: wait on the sequence number the driver filled in.
  amdxdna_drm_wait_cmd wait_request = {};
  wait_request.seq = exec_cmd->seq;
  wait_request.timeout = 50; // 50ms timeout
  wait_request.hwctx = hw_ctx_handle;

  const bool completed = (ioctl(fd, DRM_IOCTL_AMDXDNA_WAIT_CMD, &wait_request) == 0);
  return completed ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}

/// @brief Collects the BO handles referenced by a command payload and rewrites
/// the payload so that it carries addresses instead of handles.
///
/// The handle stored in data[2] (the instruction sequence) and the handle of
/// every operand are appended to @p bo_args. Each operand occupies two 32-bit
/// words starting at word 5: on entry the first word holds the BO handle; on
/// exit the pair holds the low and high halves of the mapped address.
///
/// @param count Number of entries in the command. Values below 6 are treated
/// as "no operands" (previously this underflowed and walked far past the end
/// of the payload).
/// @param bo_args Vector the BO handles are appended to.
/// @param cmd_pkt_payload Payload to scan and rewrite in place.
/// @param vmem_handle_mappings Map from BO handle to mapped address. It is only
/// read; a handle that is not present translates to address 0.
void AieAqlQueue::RegisterCmdBOs(uint32_t count, std::vector<uint32_t> &bo_args, hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload, std::unordered_map<uint32_t, void*> &vmem_handle_mappings) {

  // This is the index where the operand addresses start in a command
  constexpr uint32_t operand_starting_index = 5;

  // We have 6 arguments of the packet before we start passing operands
  // and operands are 64-bits so we need to divide by two
  constexpr uint32_t non_operand_count = 6;

  // We know element 2 is the instruction sequence
  constexpr uint32_t instr_seq_index = 2;

  // Guard the subtraction: `count` is unsigned, so a short packet would
  // otherwise wrap around to a huge operand count.
  const uint32_t num_operands =
      count > non_operand_count ? (count - non_operand_count) / 2 : 0;

  // Translate a handle without inserting into the map on a miss (operator[]
  // would default-construct a null entry).
  const auto address_of = [&vmem_handle_mappings](uint32_t bo_handle) -> uint64_t {
    const auto entry = vmem_handle_mappings.find(bo_handle);
    if (entry == vmem_handle_mappings.end()) {
      return uint64_t{0};
    }
    return reinterpret_cast<uint64_t>(entry->second);
  };

  // Keep track of the handles before we submit the packet
  const uint32_t instr_seq_handle = cmd_pkt_payload->data[instr_seq_index];
  bo_args.push_back(instr_seq_handle);

  // Going through all of the operands in the command, keeping track of the
  // handles and turning the handles into addresses. The fields are 32 bits
  // wide, so every operand spans two consecutive words.
  for (uint32_t operand_iter = 0; operand_iter < num_operands; ++operand_iter) {
    const uint32_t low_word = operand_starting_index + 2 * operand_iter;
    const uint32_t operand_handle = cmd_pkt_payload->data[low_word];
    bo_args.push_back(operand_handle);

    const uint64_t operand_addr = address_of(operand_handle);
    cmd_pkt_payload->data[low_word + 1] = static_cast<uint32_t>((operand_addr >> 32) & 0xFFFFFFFF);
    cmd_pkt_payload->data[low_word] = static_cast<uint32_t>(operand_addr & 0xFFFFFFFF);
  }

  // We know data[2] is the DPU
  // NOTE(review): the 0x04000000 tag and the 0x02FFFFFF mask are carried over
  // unchanged; the mask drops bit 24 of the address - confirm this is intended.
  cmd_pkt_payload->data[instr_seq_index] =
      static_cast<uint32_t>(0x04000000 | (address_of(instr_seq_handle) & 0x02FFFFFF));
}

/// @brief Creates a command BO, maps it into this process and returns both the
/// mapping and the BO handle.
///
/// @param size Size in bytes of the BO to create and of the mapping. The value
/// is now honoured; it used to be ignored in favour of a hard-coded 64 bytes
/// even when the caller asked for more.
/// @param handle Output: receives the handle of the new BO.
/// @param cmd Output: receives the address the BO is mapped at.
/// @param fd File descriptor the ioctls and the mmap are issued on.
/// @return HSA_STATUS_ERROR if creating the BO, querying its info or mapping
/// it fails (the outputs are left untouched in that case), otherwise
/// HSA_STATUS_SUCCESS.
hsa_status_t AieAqlQueue::CreateCmd(uint32_t size, uint32_t *handle, amdxdna_cmd **cmd, int fd) {

  // Creating the command
  amdxdna_drm_create_bo create_cmd_bo = {};
  create_cmd_bo.type = AMDXDNA_BO_CMD;
  create_cmd_bo.size = size;
  if (ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo))
    return HSA_STATUS_ERROR;

  // Ask the driver for the offset to use when mapping the BO
  amdxdna_drm_get_bo_info cmd_bo_get_bo_info = {};
  cmd_bo_get_bo_info.handle = create_cmd_bo.handle;
  if (ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_get_bo_info))
    return HSA_STATUS_ERROR;

  // mmap reports failure with MAP_FAILED rather than a null pointer, so it has
  // to be checked explicitly before the pointer is handed to the caller.
  // NOTE(review): on the error paths after BO creation the BO itself is not
  // released; nothing in this function knows how to destroy it.
  void *mapping = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, cmd_bo_get_bo_info.map_offset);
  if (mapping == MAP_FAILED)
    return HSA_STATUS_ERROR;

  *cmd = static_cast<amdxdna_cmd *>(mapping);
  *handle = create_cmd_bo.handle;

  return HSA_STATUS_SUCCESS;
}

/// @brief Processes the packets in [read_dispatch_id, write_dispatch_id) and
/// submits them to the device.
///
/// Every run of contiguous HSA_AMD_AIE_ERT_START_CU packets is turned into one
/// command chain: a command BO is created per packet, the handles of those BOs
/// are listed in a chain command, and the chain is executed and waited on. The
/// BOs referenced by the packets are synced before and after execution.
///
/// @param hw_ctx_handle Hardware context the chain is executed in.
/// @param fd File descriptor all driver ioctls are issued on.
/// @param queue_base Base address of the packet storage; packets are indexed
/// from it directly by dispatch id.
/// @param read_dispatch_id Id of the first packet to process.
/// @param write_dispatch_id Id one past the last packet to process.
/// @param vmem_handle_mappings Map from BO handle to mapped address, used to
/// rewrite packet operands.
/// @return HSA_STATUS_ERROR if a packet has an unexpected header or opcode, or
/// if any driver call (including executing/waiting on the chain) fails;
/// otherwise HSA_STATUS_SUCCESS.
///
/// NOTE(review): packets are addressed as queue_base + dispatch id with no
/// wrap-around - confirm the ids handed in are already valid slot indices.
/// NOTE(review): the command BOs created here are never unmapped or released
/// in this function.
hsa_status_t AieAqlQueue::SubmitCmd(uint32_t hw_ctx_handle, int fd, void *queue_base, uint64_t read_dispatch_id, uint64_t write_dispatch_id, std::unordered_map<uint32_t, void*> &vmem_handle_mappings) {

  hsa_amd_aie_ert_packet_t *const packets = static_cast<hsa_amd_aie_ert_packet_t *>(queue_base);

  uint64_t cur_id = read_dispatch_id;
  while (cur_id < write_dispatch_id) {

    hsa_amd_aie_ert_packet_t *pkt = packets + cur_id;

    // Get the packet header information
    if (pkt->header.header != HSA_PACKET_TYPE_VENDOR_SPECIFIC || pkt->header.AmdFormat != HSA_AMD_PACKET_TYPE_AIE_ERT)
      return HSA_STATUS_ERROR;

    // Get the payload information
    switch (pkt->opcode) {
      case HSA_AMD_AIE_ERT_START_CU: {

        std::vector<uint32_t> bo_args;
        std::vector<uint32_t> cmd_handles;

        // Iterating over future packets and seeing how many contiguous
        // HSA_AMD_AIE_ERT_START_CU packets there are. All can be combined into
        // a single chain. The opcode of the *peeked* packet is what decides
        // whether the run continues.
        uint64_t num_cont_start_cu_pkts = 1;
        for (uint64_t peek_pkt_id = cur_id + 1; peek_pkt_id < write_dispatch_id; peek_pkt_id++) {
          const hsa_amd_aie_ert_packet_t *peek_pkt = packets + peek_pkt_id;
          if (peek_pkt->opcode != HSA_AMD_AIE_ERT_START_CU)
            break;
          num_cont_start_cu_pkts++;
        }

        // Iterating over all of the contiguous HSA_AMD_AIE_ERT_START_CU packets
        for (uint64_t pkt_iter = cur_id; pkt_iter < cur_id + num_cont_start_cu_pkts; pkt_iter++) {

          // Getting the current command packet
          hsa_amd_aie_ert_packet_t *cu_pkt = packets + pkt_iter;
          hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload = reinterpret_cast<hsa_amd_aie_ert_start_kernel_data_t *>(cu_pkt->payload_data);

          // Add the handles for all of the BOs to bo_args as well as rewrite the command
          // payload handles to contain the actual virtual addresses
          RegisterCmdBOs(cu_pkt->count, bo_args, cmd_pkt_payload, vmem_handle_mappings);

          // Creating a packet that contains the command to execute the kernel
          uint32_t cmd_bo_handle = 0;
          amdxdna_cmd *cmd = nullptr;
          if (CreateCmd(64, &cmd_bo_handle, &cmd, fd))
            return HSA_STATUS_ERROR;

          // Filling in the fields of the command
          cmd->state = cu_pkt->state;
          cmd->extra_cu_masks = 0;

          // For some reason the first count needs to be a little larger than
          // it actually is, assuming there is some other data structure at the
          // beginning
          // TODO: Look more into this
          if (pkt_iter == cur_id) {
            cmd->count = cu_pkt->count + 5;
          }
          else {
            cmd->count = cu_pkt->count;
          }
          cmd->opcode = cu_pkt->opcode;
          cmd->data[0] = cmd_pkt_payload->cu_mask;
          // NOTE(review): the copy length comes straight from the packet and is
          // not checked against the 64-byte command BO requested above.
          memcpy((cmd->data + 1), cmd_pkt_payload->data, 4 * cu_pkt->count);

          // Keeping track of the handle
          cmd_handles.push_back(cmd_bo_handle);
        }

        // Creating a packet that contains the command chain
        uint32_t cmd_chain_bo_handle = 0;
        amdxdna_cmd *cmd_chain = nullptr;
        if (CreateCmd(4096, &cmd_chain_bo_handle, &cmd_chain, fd))
          return HSA_STATUS_ERROR;

        // Writing information to the command buffer
        amdxdna_cmd_chain *cmd_chain_payload = reinterpret_cast<amdxdna_cmd_chain *>(cmd_chain->data);

        // Creating a command chain
        cmd_chain->state = HSA_AMD_AIE_ERT_STATE_NEW;
        cmd_chain->extra_cu_masks = 0;
        cmd_chain->count = 0xA; // TODO: Figure out why this is the value
        cmd_chain->opcode = HSA_AMD_AIE_ERT_CMD_CHAIN;
        cmd_chain_payload->command_count = cmd_handles.size();
        cmd_chain_payload->submit_index = 0;
        cmd_chain_payload->error_index = 0;
        for (size_t i = 0; i < cmd_handles.size(); i++) {
          cmd_chain_payload->data[i] = cmd_handles[i];
        }

        // Syncing BOs before we execute the command
        if (SyncBos(bo_args, fd))
          return HSA_STATUS_ERROR;

        // Removing duplicates in the bo container. The driver will report
        // an error if we provide the same BO handle multiple times.
        // This can happen if any of the BOs are the same across jobs
        std::sort(bo_args.begin(), bo_args.end());
        bo_args.erase(std::unique(bo_args.begin(), bo_args.end()), bo_args.end());

        // Filling in the fields to execute the command chain
        amdxdna_drm_exec_cmd exec_cmd_0 = {};
        exec_cmd_0.ext = 0;
        exec_cmd_0.ext_flags = 0;
        exec_cmd_0.hwctx = hw_ctx_handle;
        exec_cmd_0.type = AMDXDNA_CMD_SUBMIT_EXEC_BUF;
        exec_cmd_0.cmd_handles = cmd_chain_bo_handle;
        exec_cmd_0.args = reinterpret_cast<__u64>(bo_args.data());
        exec_cmd_0.cmd_count = 1;
        exec_cmd_0.arg_count = bo_args.size();

        // Executing all commands in the command chain. A failed submit or wait
        // is reported to the caller instead of being silently dropped.
        if (ExecCmdAndWait(&exec_cmd_0, hw_ctx_handle, fd))
          return HSA_STATUS_ERROR;

        // Syncing BOs after we execute the command
        if (SyncBos(bo_args, fd))
          return HSA_STATUS_ERROR;

        cur_id += num_cont_start_cu_pkts;
        break;
      }
      default: {
        // Only START_CU packets are understood by the soft queue.
        return HSA_STATUS_ERROR;
      }
    }
  }

  return HSA_STATUS_SUCCESS;
}

void AieAqlQueue::StoreRelease(hsa_signal_value_t value) {
Expand Down

0 comments on commit 3dd381f

Please sign in to comment.