diff --git a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc index e2b91bf44..5d54d35ac 100644 --- a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc +++ b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc @@ -111,8 +111,8 @@ void load_pdi_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name, bin_file.read(reinterpret_cast(*buf), size); } -void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name, - void **buf) { +void load_instr_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name, + void **buf, uint32_t &num_instr) { std::ifstream bin_file(file_name, std::ios::binary | std::ios::ate | std::ios::in); @@ -129,6 +129,7 @@ void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name, auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf); assert(r == HSA_STATUS_SUCCESS); std::memcpy(*buf, pdi_vec.data(), pdi_vec.size() * sizeof(uint32_t)); + num_instr = pdi_vec.size(); } } // namespace @@ -144,9 +145,9 @@ int main(int argc, char **argv) { hsa_amd_memory_pool_t global_dev_mem_pool{0}; // System memory pool. Used for allocating kernel argument data. hsa_amd_memory_pool_t global_kernarg_mem_pool{0}; - const std::string dpu_inst_file_name(sourcePath / "add_one_insts.txt"); + const std::string instr_inst_file_name(sourcePath / "add_one_insts.txt"); const std::string pdi_file_name(sourcePath / "add_one.pdi"); - uint32_t *dpu_inst_buf(nullptr); + uint32_t *instr_inst_buf(nullptr); uint64_t *pdi_buf(nullptr); assert(aie_agents.empty()); @@ -164,8 +165,6 @@ int main(int argc, char **argv) { // Find the AIE agents in the system. r = hsa_iterate_agents(get_aie_agents, &aie_agents); assert(r == HSA_STATUS_SUCCESS); - // assert(hsa_iterate_agents(get_cpu_agents, &aie_agents) == - // HSA_STATUS_SUCCESS); assert(aie_agents.size() == 1); const auto &aie_agent = aie_agents.front(); @@ -190,14 +189,22 @@ int main(int argc, char **argv) { assert(r == HSA_STATUS_SUCCESS); assert(global_kernarg_mem_pool.handle); + // Getting the maximum size of the queue so we can submit that many consecutive + // packets. + uint32_t aie_max_queue_size; + r = hsa_agent_get_info(aie_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &aie_max_queue_size); + assert(r == HSA_STATUS_SUCCESS); + int num_pkts = aie_max_queue_size; + // Load the DPU and PDI files into a global pool that doesn't support kernel // args (DEV BO). - load_dpu_file(global_dev_mem_pool, dpu_inst_file_name, - reinterpret_cast(&dpu_inst_buf)); - uint32_t dpu_handle = 0; - r = hsa_amd_get_handle_from_vaddr(dpu_inst_buf, &dpu_handle); + uint32_t num_instr; + load_instr_file(global_dev_mem_pool, instr_inst_file_name, + reinterpret_cast(&instr_inst_buf), num_instr); + uint32_t instr_handle = 0; + r = hsa_amd_get_handle_from_vaddr(instr_inst_buf, &instr_handle); assert(r == HSA_STATUS_SUCCESS); - assert(dpu_handle != 0); + assert(instr_handle != 0); load_pdi_file(global_dev_mem_pool, pdi_file_name, reinterpret_cast(&pdi_buf)); @@ -222,85 +229,99 @@ int main(int argc, char **argv) { constexpr std::size_t data_buffer_size = num_data_elements * sizeof(std::uint32_t); - std::uint32_t *input = {}; - r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0, - reinterpret_cast(&input)); - assert(r == HSA_STATUS_SUCCESS); - std::uint32_t input_handle = {}; - r = hsa_amd_get_handle_from_vaddr(input, &input_handle); - assert(r == HSA_STATUS_SUCCESS); - assert(input_handle != 0); - - std::uint32_t *output = {}; - r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0, - reinterpret_cast(&output)); - assert(r == HSA_STATUS_SUCCESS); - std::uint32_t output_handle = {}; - r = hsa_amd_get_handle_from_vaddr(output, &output_handle); - assert(r == HSA_STATUS_SUCCESS); - assert(output_handle != 0); + std::vector input(num_pkts); + std::vector output(num_pkts); + std::vector cmd_payloads(num_pkts); + std::vector input_handle(num_pkts); + std::vector output_handle(num_pkts); + + uint64_t wr_idx = 0; + uint64_t packet_id = 0; + + for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) { + r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0, + reinterpret_cast(&input[pkt_iter])); + assert(r == HSA_STATUS_SUCCESS); + r = hsa_amd_get_handle_from_vaddr(input[pkt_iter], &input_handle[pkt_iter]); + assert(r == HSA_STATUS_SUCCESS); + assert(input_handle[pkt_iter] != 0); + + r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0, + reinterpret_cast(&output[pkt_iter])); + assert(r == HSA_STATUS_SUCCESS); + r = hsa_amd_get_handle_from_vaddr(output[pkt_iter], &output_handle[pkt_iter]); + assert(r == HSA_STATUS_SUCCESS); + assert(output_handle[pkt_iter] != 0); + + for (std::size_t i = 0; i < num_data_elements; i++) { + *(input[pkt_iter] + i) = i * (pkt_iter + 1); + *(output[pkt_iter] + i) = 0xDEFACE; + } - for (std::size_t i = 0; i < num_data_elements; i++) { - *(input + i) = i; - *(output + i) = 0xDEFACE; + // Getting a slot in the queue + wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1); + packet_id = wr_idx % aie_queue->size; + + // Creating a packet to store the command + hsa_amd_aie_ert_packet_t *cmd_pkt = static_cast( + aie_queue->base_address) + packet_id; + assert(r == HSA_STATUS_SUCCESS); + cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW; + cmd_pkt->count = 0xA; // # of arguments to put in command + cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU; + cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT; + cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC + << HSA_PACKET_HEADER_TYPE; + + // Creating the payload for the packet + hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL; + assert(r == HSA_STATUS_SUCCESS); + r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0, + reinterpret_cast(&cmd_payload)); + assert(r == HSA_STATUS_SUCCESS); + // Selecting the PDI to use with this command + cmd_payload->cu_mask = 0x1; + // Transaction opcode + cmd_payload->data[0] = 0x3; + cmd_payload->data[1] = 0x0; + cmd_payload->data[2] = instr_handle; + cmd_payload->data[3] = 0x0; + cmd_payload->data[4] = num_instr; + cmd_payload->data[5] = input_handle[pkt_iter]; + cmd_payload->data[6] = 0; + cmd_payload->data[7] = output_handle[pkt_iter]; + cmd_payload->data[8] = 0; + cmd_pkt->payload_data = reinterpret_cast(cmd_payload); + + // Keeping track of payloads so we can free them at the end + cmd_payloads[pkt_iter] = cmd_payload; } - ///////////////////////////////////// Creating the cmd packet - // Creating a packet to store the command - hsa_amd_aie_ert_packet_t *cmd_pkt = NULL; - r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0, - reinterpret_cast(&cmd_pkt)); - assert(r == HSA_STATUS_SUCCESS); - cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW; - cmd_pkt->count = 0xA; // # of arguments to put in command - cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU; - cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT; - cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC - << HSA_PACKET_HEADER_TYPE; - - // Creating the payload for the packet - hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL; - uint32_t cmd_handle; - r = hsa_amd_get_handle_from_vaddr(reinterpret_cast(cmd_pkt), - &cmd_handle); - assert(r == HSA_STATUS_SUCCESS); - r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0, - reinterpret_cast(&cmd_payload)); - assert(r == HSA_STATUS_SUCCESS); - cmd_payload->cu_mask = 0x1; // Selecting the PDI to use with this command - cmd_payload->data[0] = 0x3; // Transaction opcode - cmd_payload->data[1] = 0x0; - cmd_payload->data[2] = dpu_handle; - cmd_payload->data[3] = 0x0; - cmd_payload->data[4] = 0x44; // Size of DPU instruction - cmd_payload->data[5] = input_handle; - cmd_payload->data[6] = 0; - cmd_payload->data[7] = output_handle; - cmd_payload->data[8] = 0; - cmd_pkt->payload_data = reinterpret_cast(cmd_payload); - - uint64_t wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1); - uint64_t packet_id = wr_idx % aie_queue->size; - reinterpret_cast( - aie_queue->base_address)[packet_id] = *cmd_pkt; + // Ringing the doorbell to dispatch each packet we added to + // the queue hsa_signal_store_screlease(aie_queue->doorbell_signal, wr_idx); - for (std::size_t i = 0; i < num_data_elements; i++) { - const auto expected = *(input + i) + 1; - const auto result = *(output + i); - assert(result == expected); + for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) { + for (std::size_t i = 0; i < num_data_elements; i++) { + const auto expected = *(input[pkt_iter] + i) + 1; + const auto result = *(output[pkt_iter] + i); + assert(result == expected); + } + + r = hsa_amd_memory_pool_free(output[pkt_iter]); + assert(r == HSA_STATUS_SUCCESS); + r = hsa_amd_memory_pool_free(input[pkt_iter]); + assert(r == HSA_STATUS_SUCCESS); + r = hsa_amd_memory_pool_free(cmd_payloads[pkt_iter]); + assert(r == HSA_STATUS_SUCCESS); } r = hsa_queue_destroy(aie_queue); assert(r == HSA_STATUS_SUCCESS); - r = hsa_amd_memory_pool_free(output); - assert(r == HSA_STATUS_SUCCESS); - r = hsa_amd_memory_pool_free(input); - assert(r == HSA_STATUS_SUCCESS); r = hsa_amd_memory_pool_free(pdi_buf); assert(r == HSA_STATUS_SUCCESS); - r = hsa_amd_memory_pool_free(dpu_inst_buf); + r = hsa_amd_memory_pool_free(instr_inst_buf); assert(r == HSA_STATUS_SUCCESS); r = hsa_shut_down(); diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index b2f8fd2d0..50229daf4 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -68,14 +68,13 @@ constexpr int NON_OPERAND_COUNT = 6; constexpr int DEV_ADDR_BASE = 0x04000000; constexpr int DEV_ADDR_OFFSET_MASK = 0x02FFFFFF; -// BO size allocated for commands -constexpr int CMD_SIZE = 64; - -// This is a temp workaround. For some reason the first command count in a chain -// needs to be a larger than it actually is, assuming there is some other data -// structure at the beginning -// TODO: Look more into this -constexpr int FIRST_CMD_COUNT_SIZE_INCREASE = 5; +// The driver places a structure before each command in a command chain. +// Need to increase the size of the command by the size of this structure. +// In the following xdna driver source can see where this is implemented: +// Commit hash: eddd92c0f61592c576a500f16efa24eb23667c23 +// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_msg_priv.h#L387-L391 +// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_message.c#L637 +constexpr int CMD_COUNT_SIZE_INCREASE = 3; // Index of command payload where the instruction sequence // address is located @@ -311,7 +310,7 @@ hsa_status_t AieAqlQueue::CreateCmd(uint32_t size, uint32_t *handle, // Creating the command amdxdna_drm_create_bo create_cmd_bo = {}; create_cmd_bo.type = AMDXDNA_BO_CMD, - create_cmd_bo.size = CMD_SIZE; + create_cmd_bo.size = size; if (ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo)) return HSA_STATUS_ERROR; @@ -345,7 +344,6 @@ hsa_status_t AieAqlQueue::SubmitCmd( // Get the payload information switch (pkt->opcode) { case HSA_AMD_AIE_ERT_START_CU: { - std::vector bo_args; std::vector cmd_handles; @@ -376,23 +374,17 @@ hsa_status_t AieAqlQueue::SubmitCmd( // Creating a packet that contains the command to execute the kernel uint32_t cmd_bo_handle = 0; amdxdna_cmd *cmd = nullptr; - if (CreateCmd(64, &cmd_bo_handle, &cmd, fd)) + uint32_t cmd_size = sizeof(amdxdna_cmd) + pkt->count * sizeof(uint32_t); + if (CreateCmd(cmd_size, &cmd_bo_handle, &cmd, fd)) return HSA_STATUS_ERROR; // Filling in the fields of the command cmd->state = pkt->state; cmd->extra_cu_masks = 0; - // For some reason the first count needs to be a little larger than - // it actually is, assuming there is some other data structure at the - // beginning - // TODO: Look more into this - if (pkt_iter == cur_id) { - cmd->count = pkt->count + FIRST_CMD_COUNT_SIZE_INCREASE; - } - else { - cmd->count = pkt->count; - } + // The driver places a structure before each command in a command chain. + // Need to increase the size of the command by the size of this structure. + cmd->count = pkt->count + CMD_COUNT_SIZE_INCREASE; cmd->opcode = pkt->opcode; cmd->data[0] = cmd_pkt_payload->cu_mask; memcpy((cmd->data + 1), cmd_pkt_payload->data, 4 * pkt->count); @@ -414,8 +406,7 @@ hsa_status_t AieAqlQueue::SubmitCmd( // Creating a command chain cmd_chain->state = HSA_AMD_AIE_ERT_STATE_NEW; cmd_chain->extra_cu_masks = 0; - // TODO: Figure out why this is the value - cmd_chain->count = 0xA; + cmd_chain->count = sizeof(amdxdna_cmd_chain) + cmd_handles.size() * sizeof(uint64_t); cmd_chain->opcode = HSA_AMD_AIE_ERT_CMD_CHAIN; cmd_chain_payload->command_count = cmd_handles.size(); cmd_chain_payload->submit_index = 0;