diff --git a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc
index 5d54d35ac..08434bd21 100644
--- a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc
+++ b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc
@@ -113,23 +113,38 @@ void load_pdi_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
 void load_instr_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
                      void **buf, uint32_t &num_instr) {
+
+
+  std::cout << __LINE__ << std::endl;
   std::ifstream bin_file(file_name,
                         std::ios::binary | std::ios::ate | std::ios::in);
 
   assert(bin_file.fail() == false);
+  std::cout << __LINE__ << std::endl;
+
   auto size(bin_file.tellg());
 
   bin_file.seekg(0, std::ios::beg);
   std::vector<uint32_t> pdi_vec;
   std::string val;
+
+  std::cout << __LINE__ << std::endl;
   while (bin_file >> val) {
     pdi_vec.push_back(std::stoul(val, nullptr, 16));
   }
+
+  std::cout << __LINE__ << std::endl;
+  std::cout << "Buff is at " << buf << std::endl;
   auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf);
+  std::cout << __LINE__ << std::endl;
   assert(r == HSA_STATUS_SUCCESS);
+  std::cout << "*buf is at " << *buf << std::endl;
   std::memcpy(*buf, pdi_vec.data(), pdi_vec.size() * sizeof(uint32_t));
+  std::cout << __LINE__ << std::endl;
   num_instr = pdi_vec.size();
+
+  std::cout << __LINE__ << std::endl;
 }
 
 } // namespace
@@ -145,11 +160,13 @@ int main(int argc, char **argv) {
   hsa_amd_memory_pool_t global_dev_mem_pool{0};
   // System memory pool. Used for allocating kernel argument data.
   hsa_amd_memory_pool_t global_kernarg_mem_pool{0};
+  std::cout << __LINE__ << std::endl;
   const std::string instr_inst_file_name(sourcePath / "add_one_insts.txt");
   const std::string pdi_file_name(sourcePath / "add_one.pdi");
 
   uint32_t *instr_inst_buf(nullptr);
   uint64_t *pdi_buf(nullptr);
+  std::cout << __LINE__ << std::endl;
   assert(aie_agents.empty());
   assert(global_dev_mem_pool.handle == 0);
   assert(global_kernarg_mem_pool.handle == 0);
@@ -158,6 +175,7 @@ int main(int argc, char **argv) {
   auto r = hsa_init();
   assert(r == HSA_STATUS_SUCCESS);
 
+  std::cout << __LINE__ << std::endl;
   assert(sizeof(hsa_kernel_dispatch_packet_s) ==
          sizeof(hsa_amd_aie_ert_packet_s));
@@ -167,6 +185,7 @@ int main(int argc, char **argv) {
   assert(r == HSA_STATUS_SUCCESS);
   assert(aie_agents.size() == 1);
 
+  std::cout << __LINE__ << std::endl;
   const auto &aie_agent = aie_agents.front();
 
   // Create a queue on the first agent.
@@ -176,12 +195,14 @@ int main(int argc, char **argv) {
   assert(aie_queue);
   assert(aie_queue->base_address);
 
+  std::cout << __LINE__ << std::endl;
   // Find a pool for DEV BOs. This is a global system memory pool that is
   // mapped to the device. Will be used for PDIs and DPU instructions.
   r = hsa_amd_agent_iterate_memory_pools(
       aie_agent, get_coarse_global_dev_mem_pool, &global_dev_mem_pool);
   assert(r == HSA_STATUS_SUCCESS);
 
+  std::cout << __LINE__ << std::endl;
   // Find a pool that supports kernel args. This is just normal system memory.
   // It will be used for commands and input data.
   r = hsa_amd_agent_iterate_memory_pools(
@@ -189,6 +210,7 @@ int main(int argc, char **argv) {
   assert(r == HSA_STATUS_SUCCESS);
   assert(global_kernarg_mem_pool.handle);
 
+  std::cout << __LINE__ << std::endl;
   // Getting the maximum size of the queue so we can submit that many consecutive
   // packets.
 uint32_t aie_max_queue_size;
@@ -196,9 +218,11 @@ int main(int argc, char **argv) {
   assert(r == HSA_STATUS_SUCCESS);
 
   int num_pkts = aie_max_queue_size;
+  std::cout << __LINE__ << std::endl;
   // Load the DPU and PDI files into a global pool that doesn't support kernel
   // args (DEV BO).
   uint32_t num_instr;
+  std::cout << "instr_inst_buf: " << instr_inst_buf << std::endl;
   load_instr_file(global_dev_mem_pool, instr_inst_file_name,
                   reinterpret_cast<void **>(&instr_inst_buf), num_instr);
   uint32_t instr_handle = 0;
@@ -206,6 +230,7 @@ int main(int argc, char **argv) {
   assert(r == HSA_STATUS_SUCCESS);
   assert(instr_handle != 0);
 
+  std::cout << __LINE__ << std::endl;
   load_pdi_file(global_dev_mem_pool, pdi_file_name,
                 reinterpret_cast<void **>(&pdi_buf));
   uint32_t pdi_handle = 0;
@@ -213,6 +238,7 @@ int main(int argc, char **argv) {
   assert(r == HSA_STATUS_SUCCESS);
   assert(pdi_handle != 0);
 
+  std::cout << __LINE__ << std::endl;
   hsa_amd_aie_ert_hw_ctx_cu_config_t cu_config{.cu_config_bo = pdi_handle,
                                                .cu_func = 0};
 
@@ -224,6 +250,7 @@ int main(int argc, char **argv) {
       aie_queue, HSA_AMD_QUEUE_AIE_ERT_HW_CXT_CONFIG_CU, &config_cu_args);
   assert(r == HSA_STATUS_SUCCESS);
 
+  std::cout << __LINE__ << std::endl;
   // create inputs / outputs
   constexpr std::size_t num_data_elements = 1024;
   constexpr std::size_t data_buffer_size =
@@ -238,6 +265,7 @@ int main(int argc, char **argv) {
 
   uint64_t wr_idx = 0;
   uint64_t packet_id = 0;
+  std::cout << __LINE__ << std::endl;
   for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) {
     r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size,
                                      0, reinterpret_cast<void **>(&input[pkt_iter]));
@@ -300,6 +328,7 @@ int main(int argc, char **argv) {
   // Ringing the doorbell to dispatch each packet we added to
   // the queue
   hsa_signal_store_screlease(aie_queue->doorbell_signal, wr_idx);
+  std::cout << __LINE__ << std::endl;
 
   for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) {
     for (std::size_t i = 0; i < num_data_elements; i++) {
@@ -319,6 +348,7 @@ int main(int argc, char **argv) {
   r = hsa_queue_destroy(aie_queue);
   assert(r == HSA_STATUS_SUCCESS);
 
+  std::cout << __LINE__ << std::endl;
   r = hsa_amd_memory_pool_free(pdi_buf);
   assert(r == HSA_STATUS_SUCCESS);
   r = hsa_amd_memory_pool_free(instr_inst_buf);
diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp
index 12f140e5c..fe97431be 100644
--- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp
+++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp
@@ -136,6 +136,8 @@ hsa_status_t
 XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
                            core::MemoryRegion::AllocateFlags alloc_flags,
                            void **mem, size_t size, uint32_t node_id) {
+
+  std::cout << __func__ << __LINE__ << std::endl;
   const auto &region = static_cast<const MemoryRegion &>(mem_region);
   amdxdna_drm_create_bo create_bo_args{.size = size};
   amdxdna_drm_get_bo_info get_bo_info_args{0};
@@ -143,16 +145,20 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
   void *mapped_mem(nullptr);
 
   if (!region.IsSystem()) {
+    std::cout << __func__ << __LINE__ << std::endl;
     return HSA_STATUS_ERROR_INVALID_REGION;
   }
 
   if (region.kernarg()) {
+    std::cout << __func__ << __LINE__ << std::endl;
     create_bo_args.type = AMDXDNA_BO_SHMEM;
   } else {
+    std::cout << __func__ << __LINE__ << std::endl;
     create_bo_args.type = AMDXDNA_BO_DEV;
   }
 
   if (ioctl(fd_, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo_args) < 0) {
+    std::cout << __func__ << __LINE__ << std::endl;
     return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
   }
 
@@ -160,8 +166,10 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
   // In case we need to close this BO to avoid leaks due to some error after
   // creation.
   close_bo_args.handle = create_bo_args.handle;
+  std::cout << __func__ << __LINE__ << std::endl;
 
   if (ioctl(fd_, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info_args) < 0) {
+    std::cout << __func__ << __LINE__ << std::endl;
     // Close the BO in the case we can't get info about it.
     ioctl(fd_, DRM_IOCTL_GEM_CLOSE, &close_bo_args);
     return HSA_STATUS_ERROR;
@@ -171,6 +179,7 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
   /// to VA memory addresses. Once we can support the separate VMEM call to
   /// map handles we can fix this.
   if (region.kernarg()) {
+    std::cout << __func__ << __LINE__ << std::endl;
     mapped_mem = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
                       get_bo_info_args.map_offset);
     if (mapped_mem == MAP_FAILED) {
@@ -179,16 +188,21 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region,
       return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
     }
   } else {
+    std::cout << __func__ << __LINE__ << std::endl;
     mapped_mem = reinterpret_cast<void *>(get_bo_info_args.vaddr);
   }
 
   if (alloc_flags & core::MemoryRegion::AllocateMemoryOnly) {
+    std::cout << __func__ << __LINE__ << std::endl;
     *mem = reinterpret_cast<void *>(create_bo_args.handle);
   } else {
+    std::cout << __func__ << __LINE__ << std::endl;
     *mem = mapped_mem;
   }
 
+  std::cout << __func__ << __LINE__ << std::endl;
   vmem_handle_mappings.emplace(create_bo_args.handle, mapped_mem);
+  handle_size_map.emplace(create_bo_args.handle, size);
   vmem_handle_mappings_reverse.emplace(mapped_mem, create_bo_args.handle);
 
   return HSA_STATUS_SUCCESS;
@@ -353,11 +367,18 @@ hsa_status_t XdnaDriver::InitDeviceHeap() {
   return HSA_STATUS_SUCCESS;
 }
 
-hsa_status_t XdnaDriver::GetHandleMappings(std::unordered_map<uint32_t, void *> &vmem_handle_mappings) {
+hsa_status_t XdnaDriver::GetHandleMappings(
+    std::unordered_map<uint32_t, void *> &vmem_handle_mappings) {
   vmem_handle_mappings = this->vmem_handle_mappings;
   return HSA_STATUS_SUCCESS;
 }
 
+hsa_status_t XdnaDriver::GetHandleSizeMap(
+    std::unordered_map<uint32_t, size_t> &handle_size_map) {
+  handle_size_map = this->handle_size_map;
+  return HSA_STATUS_SUCCESS;
+}
+
 hsa_status_t XdnaDriver::GetFd(int &fd) {
   fd = fd_;
   return HSA_STATUS_SUCCESS;
diff --git a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h
index 224b85d7c..f47a93cea 100644
--- a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h
+++ b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h
@@ -165,7 +165,8 @@ class AieAqlQueue : public core::Queue,
   static hsa_status_t SubmitCmd(
       uint32_t hw_ctx_handle, int fd, void *queue_base,
       uint64_t read_dispatch_id, uint64_t write_dispatch_id,
-      std::unordered_map<uint32_t, void *> &vmem_handle_mappings);
+      std::unordered_map<uint32_t, void *> &vmem_handle_mappings,
+      std::unordered_map<uint32_t, size_t> &handle_size_map);
 
   /// @brief Creates a command BO and returns a pointer to the memory and
   //  the corresponding handle
@@ -190,7 +191,7 @@ class AieAqlQueue : public core::Queue,
   /// @brief Syncs all BOs referenced in bo_args
   ///
   /// @param bo_args vector containing handles of BOs to sync
-  static hsa_status_t SyncBos(std::vector<uint32_t> &bo_args, int fd);
+  static hsa_status_t SyncBo(int fd, uint32_t bo_arg, uint32_t direction, uint32_t size);
 
   /// @brief Executes a command and waits for its completion
   ///
diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h
index 79cbaa710..623f7f204 100644
--- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h
+++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h
@@ -71,6 +71,7 @@ class XdnaDriver : public core::Driver {
   hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override;
 
   hsa_status_t GetHandleMappings(std::unordered_map<uint32_t, void *> &vmem_handle_mappings);
+  hsa_status_t GetHandleSizeMap(std::unordered_map<uint32_t, size_t> &handle_size_map);
   hsa_status_t GetFd(int &fd);
 
   hsa_status_t GetAgentProperties(core::Agent &agent) const override;
@@ -118,6 +119,7 @@ class XdnaDriver : public core::Driver {
 
   // TODO: Remove this once we move to the vmem API
   std::unordered_map<void *, uint32_t> vmem_handle_mappings_reverse;
+  std::unordered_map<uint32_t, size_t> handle_size_map;
 
   /// @brief Virtual address range allocated for the device heap.
   ///
diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp
index 6f796441a..5639a02b3 100644
--- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp
@@ -82,7 +82,7 @@ constexpr int CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX = 2;
 
 // Environment variable to define job submission timeout
 constexpr const char *TIMEOUT_ENV_VAR = "ROCR_AIE_TIMEOUT";
-constexpr int DEFAULT_TIMEOUT_VAL = 50;
+constexpr int DEFAULT_TIMEOUT_VAL = 0;
 char *timeout_env_var_ptr = getenv(TIMEOUT_ENV_VAR);
 int timeout_val =
     timeout_env_var_ptr == nullptr ? DEFAULT_TIMEOUT_VAL : atoi(timeout_env_var_ptr);
@@ -219,12 +219,16 @@ uint64_t AieAqlQueue::AddWriteIndexAcqRel(uint64_t value) {
 
 void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) {
   std::unordered_map<uint32_t, void *> vmem_handle_mappings;
+  std::unordered_map<uint32_t, size_t> handle_size_map;
   auto &driver = static_cast<XdnaDriver &>(
       core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type));
   if (driver.GetHandleMappings(vmem_handle_mappings) != HSA_STATUS_SUCCESS) {
     return;
   }
+  if (driver.GetHandleSizeMap(handle_size_map) != HSA_STATUS_SUCCESS) {
+    return;
+  }
 
   int fd = 0;
   if (driver.GetFd(fd) != HSA_STATUS_SUCCESS) {
@@ -233,17 +237,17 @@ void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) {
 
   SubmitCmd(hw_ctx_handle_, fd, amd_queue_.hsa_queue.base_address,
             amd_queue_.read_dispatch_id, amd_queue_.write_dispatch_id,
-            vmem_handle_mappings);
+            vmem_handle_mappings, handle_size_map);
 }
 
-hsa_status_t AieAqlQueue::SyncBos(std::vector<uint32_t> &bo_args, int fd) {
-  for (unsigned int bo_arg : bo_args) {
-    amdxdna_drm_sync_bo sync_params = {};
-    sync_params.handle = bo_arg;
-    if (ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params))
-      return HSA_STATUS_ERROR;
+hsa_status_t AieAqlQueue::SyncBo(int fd, uint32_t bo_arg, uint32_t direction, uint32_t size) {
+  amdxdna_drm_sync_bo sync_params = {};
+  sync_params.handle = bo_arg;
+  sync_params.direction = direction;
+  sync_params.size = size;
+  if (ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params)) {
+    return HSA_STATUS_ERROR;
   }
   return HSA_STATUS_SUCCESS;
 }
@@ -330,7 +334,8 @@ hsa_status_t AieAqlQueue::CreateCmd(uint32_t size, uint32_t *handle,
 hsa_status_t AieAqlQueue::SubmitCmd(
     uint32_t hw_ctx_handle, int fd, void *queue_base,
     uint64_t read_dispatch_id, uint64_t write_dispatch_id,
-    std::unordered_map<uint32_t, void *> &vmem_handle_mappings) {
+    std::unordered_map<uint32_t, void *> &vmem_handle_mappings,
+    std::unordered_map<uint32_t, size_t> &handle_size_map) {
   uint64_t cur_id = read_dispatch_id;
   while (cur_id < write_dispatch_id) {
     hsa_amd_aie_ert_packet_t *pkt =
@@ -351,9 +356,6 @@ hsa_status_t AieAqlQueue::SubmitCmd(
       // packets there are. All can be combined into a single chain.
       int num_cont_start_cu_pkts = 1;
       for (int peak_pkt_id = cur_id + 1; peak_pkt_id < write_dispatch_id; peak_pkt_id++) {
-        if (pkt->opcode != HSA_AMD_AIE_ERT_START_CU) {
-          break;
-        }
         num_cont_start_cu_pkts++;
       }
 
@@ -416,8 +418,10 @@ hsa_status_t AieAqlQueue::SubmitCmd(
       }
 
       // Syncing BOs before we execute the command
-      if (SyncBos(bo_args, fd))
-        return HSA_STATUS_ERROR;
+      for (auto bo_arg : bo_args) {
+        if (SyncBo(fd, bo_arg, SYNC_DIRECT_TO_DEVICE, handle_size_map[bo_arg]))
+          return HSA_STATUS_ERROR;
+      }
 
       // Removing duplicates in the bo container. The driver will report
       // an error if we provide the same BO handle multiple times.
@@ -440,8 +444,12 @@ hsa_status_t AieAqlQueue::SubmitCmd(
       ExecCmdAndWait(&exec_cmd_0, hw_ctx_handle, fd);
 
       // Syncing BOs after we execute the command
-      if (SyncBos(bo_args, fd))
-        return HSA_STATUS_ERROR;
+      for (auto bo_arg : bo_args) {
+        if (SyncBo(fd, bo_arg, SYNC_DIRECT_FROM_DEVICE,
+                   handle_size_map[bo_arg])) {
+          return HSA_STATUS_ERROR;
+        }
+      }
 
       cur_id += num_cont_start_cu_pkts;
       break;
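
For reference, a minimal sketch of the per-BO sync flow this patch introduces in SubmitCmd: one DRM_IOCTL_AMDXDNA_SYNC_BO call per handle, with an explicit direction and the byte size recorded by AllocateMemory in the new handle-to-size map. It assumes the xdna UAPI header defining amdxdna_drm_sync_bo, DRM_IOCTL_AMDXDNA_SYNC_BO, and the SYNC_DIRECT_* constants is on the include path (named amdxdna_accel.h here); the helper names sync_bo_one and sync_bo_all are illustrative and not part of the runtime.

```cpp
// Illustrative sketch only, not part of the patch. Assumes the amdxdna UAPI
// header provides amdxdna_drm_sync_bo, DRM_IOCTL_AMDXDNA_SYNC_BO,
// SYNC_DIRECT_TO_DEVICE and SYNC_DIRECT_FROM_DEVICE.
#include <sys/ioctl.h>

#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <vector>

#include "amdxdna_accel.h"

// Sync a single BO in the given direction, mirroring AieAqlQueue::SyncBo.
// Returns 0 on success, matching the ioctl convention.
static int sync_bo_one(int fd, uint32_t handle, uint32_t direction,
                       uint32_t size) {
  amdxdna_drm_sync_bo sync_params = {};
  sync_params.handle = handle;
  sync_params.direction = direction;
  sync_params.size = size;
  return ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params);
}

// Sync every BO referenced by a command, looking each size up in the
// handle -> size bookkeeping that XdnaDriver::AllocateMemory now maintains.
static bool sync_bo_all(int fd, const std::vector<uint32_t> &bo_args,
                        const std::unordered_map<uint32_t, size_t> &sizes,
                        uint32_t direction) {
  for (uint32_t handle : bo_args) {
    auto it = sizes.find(handle);
    // A handle without a recorded size was never seen by AllocateMemory.
    if (it == sizes.end()) return false;
    if (sync_bo_one(fd, handle, direction,
                    static_cast<uint32_t>(it->second)) != 0)
      return false;
  }
  return true;
}

// Usage mirroring SubmitCmd: flush to the device before ExecCmdAndWait, then
// sync back from the device once the command has completed.
//   sync_bo_all(fd, bo_args, handle_size_map, SYNC_DIRECT_TO_DEVICE);
//   /* ExecCmdAndWait(...) */
//   sync_bo_all(fd, bo_args, handle_size_map, SYNC_DIRECT_FROM_DEVICE);
```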