diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 303312932..4ba196b5f 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -118,7 +118,9 @@ hsa_status_t XdnaDriver::GetAgentProperties(core::Agent &agent) const { return HSA_STATUS_ERROR; } - aie_agent.SetNumCols(aie_metadata.cols); + // Right now can only target N-1 columns so putting this + // here as a workaround + aie_agent.SetNumCols(aie_metadata.cols - 1); aie_agent.SetNumCoreRows(aie_metadata.core.row_count); return HSA_STATUS_SUCCESS; @@ -351,6 +353,16 @@ hsa_status_t XdnaDriver::InitDeviceHeap() { return HSA_STATUS_SUCCESS; } +hsa_status_t XdnaDriver::GetHandleMappings(std::unordered_map &vmem_handle_mappings) { + vmem_handle_mappings = this->vmem_handle_mappings; + return HSA_STATUS_SUCCESS; +} + +hsa_status_t XdnaDriver::GetFd(int &fd) { + fd = fd_; + return HSA_STATUS_SUCCESS; +} + hsa_status_t XdnaDriver::FreeDeviceHeap() { if (dev_heap_parent) { munmap(dev_heap_parent, dev_heap_align * 2 - 1); @@ -388,6 +400,13 @@ hsa_status_t XdnaDriver::ConfigHwCtxCU( config_cu_param.cu_configs[i].cu_config_bo; xdna_config_cu_param->cu_configs[i].cu_func = config_cu_param.cu_configs[i].cu_func; + + // sync configuration buffer + amdxdna_drm_sync_bo sync_args = {}; + sync_args.handle = xdna_config_cu_param->cu_configs[i].cu_bo; + if (ioctl(fd_, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_args) < 0) { + return HSA_STATUS_ERROR; + } } amdxdna_drm_config_hwctx config_hw_ctx_args{ diff --git a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h index 70f05e28a..b2cf67f63 100644 --- a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h +++ b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h @@ -51,6 +51,35 @@ #include "core/inc/signal.h" #include "core/util/locks.h" +/* + * Interpretation of the beginning of data payload for ERT_CMD_CHAIN in + * amdxdna_cmd. The rest of the payload in amdxdna_cmd is cmd BO handles. + */ +struct amdxdna_cmd_chain { + __u32 command_count; + __u32 submit_index; + __u32 error_index; + __u32 reserved[3]; + __u64 data[] __counted_by(command_count); +}; + + +/* Exec buffer command header format */ +struct amdxdna_cmd { + union { + struct { + __u32 state : 4; + __u32 unused : 6; + __u32 extra_cu_masks : 2; + __u32 count : 11; + __u32 opcode : 5; + __u32 reserved : 4; + }; + __u32 header; + }; + __u32 data[] __counted_by(count); +}; + namespace rocr { namespace AMD { @@ -134,6 +163,39 @@ class AieAqlQueue : public core::Queue, /// @brief Base of the queue's ring buffer storage. void *ring_buf_ = nullptr; + hsa_status_t AieAqlQueue::SubmitCmd(uint32_t hw_ctx_handle, int fd, void *queue_base, + uint64_t read_dispatch_id, uint64_t write_dispatch_id, + std::unordered_map &vmem_handle_mappings); + + /// @brief Creates a command BO and returns a pointer to the memory and + // the corresponding handle + /// + /// @param size size of memory to allocate + /// @param handle A pointer to the BO handle + /// @param cmd A pointer to the buffer + hsa_status_t CreateCmd(uint32_t size, uint32_t *handle, amdxdna_cmd **cmd, int fd); + + /// @brief Adds all BOs in a command packet payload to a vector + /// and replaces the handles with a virtual address + /// + /// @param count Number of entries in the command + /// @param bo_args A pointer to a vector that contains all bo handles + /// @param cmd_pkt_payload A pointer to the payload of the command + void RegisterCmdBOs(uint32_t count, std::vector &bo_args, + hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload, + std::unordered_map &vmem_handle_mappings); + + /// @brief Syncs all BOs referenced in bo_args + /// + /// @param bo_args vector containing handles of BOs to sync + hsa_status_t SyncBos(std::vector &bo_args, int fd); + + /// @brief Executes a command and waits for its completion + /// + /// @param exec_cmd Structure containing the details of the command to execute + /// @param hw_ctx_handle the handle of the hardware context to run this command + hsa_status_t ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd, uint32_t hw_ctx_handle, int fd); + /// @brief Handle for an application context on the AIE device. /// /// Each user queue will have an associated context. This handle is assigned diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index 28572e135..2f5a6b73a 100644 --- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -47,6 +47,7 @@ #include "core/inc/driver.h" #include "core/inc/memory_region.h" +#include "core/driver/xdna/uapi/amdxdna_accel.h" namespace rocr { namespace core { @@ -69,6 +70,9 @@ class XdnaDriver : public core::Driver { hsa_status_t Init() override; hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override; + hsa_status_t GetHandleMappings(std::unordered_map &vmem_handle_mappings); + hsa_status_t GetFd(int &fd); + hsa_status_t GetAgentProperties(core::Agent &agent) const override; hsa_status_t GetMemoryProperties(uint32_t node_id, diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index e8562f226..8cd409d3c 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -41,6 +41,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "core/inc/amd_aie_aql_queue.h" +#include "core/inc/amd_xdna_driver.h" #ifdef __linux__ #include @@ -48,6 +49,8 @@ #include #include #include +#include +#include #endif #ifdef _WIN32 @@ -195,8 +198,230 @@ uint64_t AieAqlQueue::AddWriteIndexAcqRel(uint64_t value) { } void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) { - atomic::Store(signal_.hardware_doorbell_ptr, uint64_t(value), - std::memory_order_release); + std::unordered_map vmem_handle_mappings; + if(static_cast(core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type)).GetHandleMappings(vmem_handle_mappings) != HSA_STATUS_SUCCESS) + return; + + int fd = 0; + if(static_cast(core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type)).GetFd(fd) != HSA_STATUS_SUCCESS) + return; + + SubmitCmd(hw_ctx_handle_, fd, amd_queue_.hsa_queue.base_address, amd_queue_.read_dispatch_id, amd_queue_.write_dispatch_id, vmem_handle_mappings); +} + +hsa_status_t AieAqlQueue::SyncBos(std::vector &bo_args, int fd) { + for (int i = 0 ; i < bo_args.size(); i++) { + amdxdna_drm_sync_bo sync_params = {}; + sync_params.handle = bo_args[i]; + if (ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params)) + return HSA_STATUS_ERROR; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t AieAqlQueue::ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd, uint32_t hw_ctx_handle, int fd) { + // Submit the cmd + if (ioctl(fd, DRM_IOCTL_AMDXDNA_EXEC_CMD, exec_cmd)) + return HSA_STATUS_ERROR; + + // Waiting for command to finish + amdxdna_drm_wait_cmd wait_cmd = {}; + wait_cmd.hwctx = hw_ctx_handle; + wait_cmd.timeout = 50; // 50ms timeout + wait_cmd.seq = exec_cmd->seq; + + if (ioctl(fd, DRM_IOCTL_AMDXDNA_WAIT_CMD, &wait_cmd)) + return HSA_STATUS_ERROR; + + return HSA_STATUS_SUCCESS; +} + +void AieAqlQueue::RegisterCmdBOs(uint32_t count, std::vector &bo_args, hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload, std::unordered_map &vmem_handle_mappings) { + + // This is the index where the operand addresses start in a command + const int operand_starting_index = 5; + + // We have 6 arguments of the packet before we start passing operands + // and operands are 64-bits so we need to divide by two + constexpr int non_operand_count = 6; + uint32_t num_operands = (count - non_operand_count) / 2; + + // Keep track of the handles before we submit the packet + bo_args.push_back(cmd_pkt_payload->data[2]); // we know element 2 is the instruction sequence + + + // Going through all of the operands in the command, keeping track of the + // handles and turning the handles into addresses. The starting index of + // the operands in a command is `operand_starting_index` and the fields + // are 32-bits we need to iterate over every two + for (int operand_iter = 0; operand_iter < num_operands; operand_iter++) { + bo_args.push_back(cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter]); + cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter + 1 ] = ((uint64_t)vmem_handle_mappings[cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter]] >> 32) & 0xFFFFFFFF; + cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter ] = (uint64_t)vmem_handle_mappings[cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter]] & 0xFFFFFFFF; + } + + // We know data[2] is the DPU + cmd_pkt_payload->data[2] = 0x04000000 | (reinterpret_cast(vmem_handle_mappings[cmd_pkt_payload->data[2]]) & 0x02FFFFFF); + + return; +} + +hsa_status_t AieAqlQueue::CreateCmd(uint32_t size, uint32_t *handle, amdxdna_cmd **cmd, int fd) { + + // Creating the command + amdxdna_drm_create_bo create_cmd_bo = {}; + create_cmd_bo.type = AMDXDNA_BO_CMD, + create_cmd_bo.size = 64; + if (ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo)) + return HSA_STATUS_ERROR; + + amdxdna_drm_get_bo_info cmd_bo_get_bo_info = {}; + cmd_bo_get_bo_info.handle = create_cmd_bo.handle; + if (ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_get_bo_info)) + return HSA_STATUS_ERROR; + + *cmd = static_cast(mmap(0, 64, PROT_READ | PROT_WRITE, MAP_SHARED, fd, cmd_bo_get_bo_info.map_offset)); + *handle = create_cmd_bo.handle; + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t AieAqlQueue::SubmitCmd(uint32_t hw_ctx_handle, int fd, void *queue_base, uint64_t read_dispatch_id, uint64_t write_dispatch_id, std::unordered_map &vmem_handle_mappings) { + + // This is the index where the operand addresses start in a command + const int operand_starting_index = 5; + + uint64_t cur_id = read_dispatch_id; + while (cur_id < write_dispatch_id) { + + hsa_amd_aie_ert_packet_t *pkt = static_cast(queue_base) + cur_id; + + // Get the packet header information + if (pkt->header.header != HSA_PACKET_TYPE_VENDOR_SPECIFIC || pkt->header.AmdFormat != HSA_AMD_PACKET_TYPE_AIE_ERT) + return HSA_STATUS_ERROR; + + // Get the payload information + switch (pkt->opcode) { + case HSA_AMD_AIE_ERT_START_CU: { + + std::vector bo_args; + std::vector cmd_handles; + + // Iterating over future packets and seeing how many contigous HSA_AMD_AIE_ERT_START_CU + // packets there are. All can be combined into a single chain. + int num_cont_start_cu_pkts = 1; + for (int peak_pkt_id = cur_id + 1; peak_pkt_id < write_dispatch_id; peak_pkt_id++) { + hsa_amd_aie_ert_packet_t *peak_pkt = static_cast(queue_base) + peak_pkt_id; + if (pkt->opcode == HSA_AMD_AIE_ERT_START_CU) { + num_cont_start_cu_pkts++; + } + else { + break; + } + } + + // Iterating over all of the contigous HSA_AMD_AIE_ERT_CMD_CHAIN packets + for (int pkt_iter = cur_id; pkt_iter < cur_id + num_cont_start_cu_pkts; pkt_iter++) { + + // Getting the current command packet + hsa_amd_aie_ert_packet_t *pkt = static_cast(queue_base) + pkt_iter; + hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload = reinterpret_cast(pkt->payload_data); + + // Add the handles for all of the BOs to bo_args as well as rewrite the command + // payload handles to contain the actual virtual addresses + RegisterCmdBOs(pkt->count, bo_args, cmd_pkt_payload, vmem_handle_mappings); + + // Creating a packet that contains the command to execute the kernel + uint32_t cmd_bo_handle = 0; + amdxdna_cmd *cmd = nullptr; + if (CreateCmd(64, &cmd_bo_handle, &cmd, fd)) + return HSA_STATUS_ERROR; + + // Filling in the fields of the command + cmd->state = pkt->state; + cmd->extra_cu_masks = 0; + + // For some reason the first count needs to be a little larger than + // it actually is, assuming there is some other data structure at the + // beginning + // TODO: Look more into this + if (pkt_iter == cur_id) { + cmd->count = pkt->count + 5; + } + else { + cmd->count = pkt->count; + } + cmd->opcode = pkt->opcode; + cmd->data[0] = cmd_pkt_payload->cu_mask; + memcpy((cmd->data + 1), cmd_pkt_payload->data, 4 * pkt->count); + + // Keeping track of the handle + cmd_handles.push_back(cmd_bo_handle); + } + + // Creating a packet that contains the command chain + uint32_t cmd_chain_bo_handle = 0; + amdxdna_cmd *cmd_chain = nullptr; + if (CreateCmd(4096, &cmd_chain_bo_handle, &cmd_chain, fd)) + return HSA_STATUS_ERROR; + + // Writing information to the command buffer + amdxdna_cmd_chain *cmd_chain_payload = reinterpret_cast(cmd_chain->data); + + // Creating a command chain + cmd_chain->state = HSA_AMD_AIE_ERT_STATE_NEW; + cmd_chain->extra_cu_masks = 0; + cmd_chain->count = 0xA; // TODO: Figure out why this is the value + cmd_chain->opcode = HSA_AMD_AIE_ERT_CMD_CHAIN; + cmd_chain_payload->command_count = cmd_handles.size(); + cmd_chain_payload->submit_index = 0; + cmd_chain_payload->error_index = 0; + for (int i = 0; i < cmd_handles.size(); i++) { + cmd_chain_payload->data[i] = cmd_handles[i]; + } + + // Syncing BOs before we execute the command + if (SyncBos(bo_args, fd)) + return HSA_STATUS_ERROR; + + // Removing duplicates in the bo container. The driver will report + // an error if we provide the same BO handle multiple times. + // This can happen if any of the BOs are the same across jobs + std::sort(bo_args.begin(), bo_args.end()); + bo_args.erase(std::unique(bo_args.begin(), bo_args.end()), bo_args.end()); + + // Filling in the fields to execute the command chain + amdxdna_drm_exec_cmd exec_cmd_0 = {}; + exec_cmd_0.ext = 0; + exec_cmd_0.ext_flags = 0; + exec_cmd_0.hwctx = hw_ctx_handle; + exec_cmd_0.type = AMDXDNA_CMD_SUBMIT_EXEC_BUF; + exec_cmd_0.cmd_handles = cmd_chain_bo_handle; + exec_cmd_0.args = (__u64)bo_args.data(); + exec_cmd_0.cmd_count = 1; + exec_cmd_0.arg_count = bo_args.size(); + + // Executing all commands in the command chain + ExecCmdAndWait(&exec_cmd_0, hw_ctx_handle, fd); + + // Syncing BOs after we execute the command + if (SyncBos(bo_args, fd)) + return HSA_STATUS_ERROR; + + cur_id += num_cont_start_cu_pkts; + break; + } + default: { + return HSA_STATUS_ERROR; + break; + } + + } + + } + + return HSA_STATUS_SUCCESS; } void AieAqlQueue::StoreRelease(hsa_signal_value_t value) {