From 504406b7795fab9cf0249a3942c1bc65656efb6f Mon Sep 17 00:00:00 2001 From: Eddie Richter Date: Wed, 30 Oct 2024 15:58:41 -0600 Subject: [PATCH 1/7] This commit has two changes: * Instead of using handles in the packet and in the configure hardware context sideband, we are instead passing virtual addresses. This removes some sidebands to get the handle from a virtual address and simplfies the application. The ROCr runtime has access to both so can always do the translation. * This is modifying the ioctl interface to xdna to work with the latest driver, which includes changing the heap size, passing the size of a buffer into the sync, and some small misc changes. This commit leaves one TODO which is having the user specify the sizes of operands so we know the size. This is an intermediate step to sync the BOs in userspace to avoid the syscall. --- rocrtst/suites/aie/aie_hsa_dispatch_test.cc | 19 +- .../core/driver/xdna/amd_xdna_driver.cpp | 32 ++- .../core/driver/xdna/uapi/amdxdna_accel.h | 187 ++++++++---------- .../hsa-runtime/core/inc/amd_aie_aql_queue.h | 4 +- .../hsa-runtime/core/inc/amd_xdna_driver.h | 11 +- .../core/runtime/amd_aie_aql_queue.cpp | 49 ++--- runtime/hsa-runtime/inc/hsa_ext_amd.h | 29 +++ 7 files changed, 175 insertions(+), 156 deletions(-) diff --git a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc index 5d54d35ac..ce3c7135e 100644 --- a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc +++ b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc @@ -17,6 +17,9 @@ #include "hsa/hsa.h" #include "hsa/hsa_ext_amd.h" +#define LOW_ADDR(addr) (reinterpret_cast(addr) & 0xFFFFFFFF) +#define HIGH_ADDR(addr) (reinterpret_cast(addr) >> 32) + namespace { hsa_status_t get_agent(hsa_agent_t agent, std::vector *agents, @@ -213,10 +216,10 @@ int main(int argc, char **argv) { assert(r == HSA_STATUS_SUCCESS); assert(pdi_handle != 0); - hsa_amd_aie_ert_hw_ctx_cu_config_t cu_config{.cu_config_bo = pdi_handle, + hsa_amd_aie_ert_hw_ctx_cu_config_addr_t cu_config{.cu_config_addr = reinterpret_cast(pdi_buf), .cu_func = 0}; - hsa_amd_aie_ert_hw_ctx_config_cu_param_t config_cu_args{ + hsa_amd_aie_ert_hw_ctx_config_cu_param_addr_t config_cu_args{ .num_cus = 1, .cu_configs = &cu_config}; // Configure the queue's hardware context. 
@@ -284,13 +287,13 @@ int main(int argc, char **argv) { // Transaction opcode cmd_payload->data[0] = 0x3; cmd_payload->data[1] = 0x0; - cmd_payload->data[2] = instr_handle; - cmd_payload->data[3] = 0x0; + cmd_payload->data[2] = LOW_ADDR(instr_inst_buf); + cmd_payload->data[3] = HIGH_ADDR(instr_inst_buf); cmd_payload->data[4] = num_instr; - cmd_payload->data[5] = input_handle[pkt_iter]; - cmd_payload->data[6] = 0; - cmd_payload->data[7] = output_handle[pkt_iter]; - cmd_payload->data[8] = 0; + cmd_payload->data[5] = LOW_ADDR(input[pkt_iter]); + cmd_payload->data[6] = HIGH_ADDR(input[pkt_iter]); + cmd_payload->data[7] = LOW_ADDR(output[pkt_iter]); + cmd_payload->data[8] = HIGH_ADDR(output[pkt_iter]); cmd_pkt->payload_data = reinterpret_cast(cmd_payload); // Keeping track of payloads so we can free them at the end diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 12f140e5c..0437e2dec 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -189,14 +189,14 @@ XdnaDriver::AllocateMemory(const core::MemoryRegion &mem_region, } vmem_handle_mappings.emplace(create_bo_args.handle, mapped_mem); - vmem_handle_mappings_reverse.emplace(mapped_mem, create_bo_args.handle); + vmem_addr_mappings.emplace(mapped_mem, create_bo_args.handle); return HSA_STATUS_SUCCESS; } hsa_status_t XdnaDriver::FreeMemory(void* ptr, size_t size) { - auto it = vmem_handle_mappings_reverse.find(ptr); - if (it == vmem_handle_mappings_reverse.end()) + auto it = vmem_addr_mappings.find(ptr); + if (it == vmem_addr_mappings.end()) return HSA_STATUS_ERROR_INVALID_ALLOCATION; // TODO:ypapadop-amd: need to unmap memory, but we don't know if it's mapped or not as we don't have @@ -211,7 +211,7 @@ hsa_status_t XdnaDriver::FreeMemory(void* ptr, size_t size) { } vmem_handle_mappings.erase(handle); - vmem_handle_mappings_reverse.erase(it); + vmem_addr_mappings.erase(it); return HSA_STATUS_SUCCESS; } @@ -272,15 +272,15 @@ XdnaDriver::ConfigHwCtx(core::Queue &queue, case HSA_AMD_QUEUE_AIE_ERT_HW_CXT_CONFIG_CU: return ConfigHwCtxCU( queue, - *reinterpret_cast(args)); + *reinterpret_cast(args)); default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; } } hsa_status_t XdnaDriver::GetHandleFromVaddr(void* ptr, uint32_t* handle) { - auto it = vmem_handle_mappings_reverse.find(ptr); - if (it == vmem_handle_mappings_reverse.end()) + auto it = vmem_addr_mappings.find(ptr); + if (it == vmem_addr_mappings.end()) return HSA_STATUS_ERROR_INVALID_ALLOCATION; *handle = it->second; return HSA_STATUS_SUCCESS; @@ -358,6 +358,11 @@ hsa_status_t XdnaDriver::GetHandleMappings(std::unordered_map & return HSA_STATUS_SUCCESS; } +hsa_status_t XdnaDriver::GetAddrMappings(std::unordered_map &vmem_handle_mappings) { + vmem_handle_mappings = this->vmem_addr_mappings; + return HSA_STATUS_SUCCESS; +} + hsa_status_t XdnaDriver::GetFd(int &fd) { fd = fd_; return HSA_STATUS_SUCCESS; @@ -379,7 +384,7 @@ hsa_status_t XdnaDriver::FreeDeviceHeap() { hsa_status_t XdnaDriver::ConfigHwCtxCU( core::Queue &queue, - hsa_amd_aie_ert_hw_ctx_config_cu_param_t &config_cu_param) { + hsa_amd_aie_ert_hw_ctx_config_cu_param_addr_t &config_cu_param) { if (!AieAqlQueue::IsType(&queue)) { return HSA_STATUS_ERROR_INVALID_QUEUE; } @@ -401,14 +406,21 @@ hsa_status_t XdnaDriver::ConfigHwCtxCU( xdna_config_cu_param->num_cus = config_cu_param.num_cus; for (int i = 0; i < xdna_config_cu_param->num_cus; ++i) { - 
xdna_config_cu_param->cu_configs[i].cu_bo = - config_cu_param.cu_configs[i].cu_config_bo; + + // Get the handle from the address + auto cu_bo = vmem_addr_mappings.find(reinterpret_cast(config_cu_param.cu_configs[i].cu_config_addr)); + if (cu_bo == vmem_addr_mappings.end()) + return HSA_STATUS_ERROR_INVALID_ALLOCATION; + + xdna_config_cu_param->cu_configs[i].cu_bo = cu_bo->second; xdna_config_cu_param->cu_configs[i].cu_func = config_cu_param.cu_configs[i].cu_func; // sync configuration buffer amdxdna_drm_sync_bo sync_args = {}; sync_args.handle = xdna_config_cu_param->cu_configs[i].cu_bo; + sync_args.offset = 0; + sync_args.size = 4 * 1024; // TODO: Try this with the proper size if (ioctl(fd_, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_args) < 0) { return HSA_STATUS_ERROR; } diff --git a/runtime/hsa-runtime/core/driver/xdna/uapi/amdxdna_accel.h b/runtime/hsa-runtime/core/driver/xdna/uapi/amdxdna_accel.h index 9182a0fd7..03e74d387 100644 --- a/runtime/hsa-runtime/core/driver/xdna/uapi/amdxdna_accel.h +++ b/runtime/hsa-runtime/core/driver/xdna/uapi/amdxdna_accel.h @@ -6,7 +6,7 @@ #ifndef AMDXDNA_ACCEL_H_ #define AMDXDNA_ACCEL_H_ -#include +#include #include #include @@ -14,14 +14,9 @@ extern "C" { #endif -#ifndef __counted_by -#define __counted_by(cnt) -#endif - #define AMDXDNA_DRIVER_MAJOR 1 #define AMDXDNA_DRIVER_MINOR 0 -#define AMDXDNA_INVALID_CMD_HANDLE (~0UL) #define AMDXDNA_INVALID_ADDR (~0UL) #define AMDXDNA_INVALID_CTX_HANDLE 0 #define AMDXDNA_INVALID_BO_HANDLE 0 @@ -53,8 +48,6 @@ enum amdxdna_drm_ioctl_id { DRM_AMDXDNA_WAIT_CMD, DRM_AMDXDNA_GET_INFO, DRM_AMDXDNA_SET_STATE, - DRM_AMDXDNA_SUBMIT_WAIT, - DRM_AMDXDNA_SUBMIT_SIGNAL, DRM_AMDXDNA_NUM_IOCTLS }; @@ -96,6 +89,7 @@ struct amdxdna_qos_info { * @mem_size: Size of AIE tile memory. * @umq_doorbell: Returned offset of doorbell associated with UMQ. * @handle: Returned hardware context handle. + * @pad: Structure padding. */ struct amdxdna_drm_create_hwctx { __u64 ext; @@ -108,12 +102,13 @@ struct amdxdna_drm_create_hwctx { __u32 mem_size; __u32 umq_doorbell; __u32 handle; + __u32 pad; }; /** * struct amdxdna_drm_destroy_hwctx - Destroy hardware context. * @handle: Hardware context handle. - * @pad: MBZ. + * @pad: Structure padding. */ struct amdxdna_drm_destroy_hwctx { __u32 handle; @@ -122,22 +117,22 @@ struct amdxdna_drm_destroy_hwctx { /** * struct amdxdna_cu_config - configuration for one CU - * @cu_bo: CU configuration buffer bo handle - * @cu_func: Functional of a CU - * @pad: MBZ + * @cu_bo: CU configuration buffer bo handle. + * @cu_func: Function of a CU. + * @pad: Structure padding. */ struct amdxdna_cu_config { __u32 cu_bo; - __u8 cu_func; - __u8 pad[3]; + __u8 cu_func; + __u8 pad[3]; }; /** * struct amdxdna_hwctx_param_config_cu - configuration for CUs in hardware * context - * @num_cus: Number of CUs to configure - * @pad: MBZ - * @cu_configs: Array of CU configurations of struct amdxdna_cu_config + * @num_cus: Number of CUs to configure. + * @pad: Structure padding. + * @cu_configs: Array of CU configurations of struct amdxdna_cu_config. */ struct amdxdna_hwctx_param_config_cu { __u16 num_cus; @@ -160,6 +155,7 @@ enum amdxdna_drm_config_hwctx_param { * @param_val: A structure specified by the param_type struct member. * @param_val_size: Size of the parameter buffer pointed to by the param_val. * If param_val is not a pointer, driver can ignore this. + * @pad: Structure padding. * * Note: if the param_val is a pointer pointing to a buffer, the maximum size * of the buffer is 4KiB(PAGE_SIZE). 
@@ -191,17 +187,16 @@ enum amdxdna_bo_type { /** * struct amdxdna_drm_create_bo - Create a buffer object. * @flags: Buffer flags. MBZ. - * @type: Buffer type. * @vaddr: User VA of buffer if applied. MBZ. * @size: Size in bytes. + * @type: Buffer type. * @handle: Returned DRM buffer object handle. */ struct amdxdna_drm_create_bo { __u64 flags; - __u32 type; - __u32 _pad; __u64 vaddr; __u64 size; + __u32 type; __u32 handle; }; @@ -210,6 +205,7 @@ struct amdxdna_drm_create_bo { * @ext: MBZ. * @ext_flags: MBZ. * @handle: DRM buffer object handle. + * @pad: Structure padding. * @map_offset: Returned DRM fake offset for mmap(). * @vaddr: Returned user VA of buffer. 0 in case user needs mmap(). * @xdna_addr: Returned XDNA device virtual address. @@ -218,7 +214,7 @@ struct amdxdna_drm_get_bo_info { __u64 ext; __u64 ext_flags; __u32 handle; - __u32 _pad; + __u32 pad; __u64 map_offset; __u64 vaddr; __u64 xdna_addr; @@ -233,7 +229,7 @@ struct amdxdna_drm_get_bo_info { */ struct amdxdna_drm_sync_bo { __u32 handle; -#define SYNC_DIRECT_TO_DEVICE 0U +#define SYNC_DIRECT_TO_DEVICE 0U #define SYNC_DIRECT_FROM_DEVICE 1U __u32 direction; __u64 offset; @@ -252,8 +248,8 @@ enum amdxdna_cmd_type { * @ext_flags: MBZ. * @hwctx: Hardware context handle. * @type: One of command type in enum amdxdna_cmd_type. - * @cmd_handles: Array of command handles or the command handle itself in case - * of just one. + * @cmd_handles: Array of command handles or the command handle itself + * in case of just one. * @args: Array of arguments for all command handles. * @cmd_count: Number of command handles in the cmd_handles array. * @arg_count: Number of arguments in the args array. @@ -279,8 +275,6 @@ struct amdxdna_drm_exec_cmd { * @seq: sequence number of the command returned by execute command. * * Wait a command specified by seq to be completed. - * Using AMDXDNA_INVALID_CMD_HANDLE as seq means wait till there is a free slot - * to submit a new command. */ struct amdxdna_drm_wait_cmd { __u32 hwctx; @@ -290,21 +284,20 @@ struct amdxdna_drm_wait_cmd { /** * struct amdxdna_drm_query_aie_status - Query the status of the AIE hardware - * @buffer: The user space buffer that will return the AIE status - * @buffer_size: The size of the user space buffer - * @cols_filled: A bitmap of AIE columns whose data has been returned in the - * buffer. + * @buffer: The user space buffer that will return the AIE status. + * @buffer_size: The size of the user space buffer. + * @cols_filled: A bitmap of AIE columns whose data has been returned in the buffer. */ struct amdxdna_drm_query_aie_status { - __u64 buffer; /* out */ + __u64 buffer; /* out */ __u32 buffer_size; /* in */ __u32 cols_filled; /* out */ }; /** * struct amdxdna_drm_query_aie_version - Query the version of the AIE hardware - * @major: The major version number - * @minor: The minor version number + * @major: The major version number. + * @minor: The minor version number. */ struct amdxdna_drm_query_aie_version { __u32 major; /* out */ @@ -319,7 +312,7 @@ struct amdxdna_drm_query_aie_version { * @dma_channel_count: The number of dma channels. * @lock_count: The number of locks. * @event_reg_count: The number of events. - * @pad: MBZ. + * @pad: Structure padding. 
*/ struct amdxdna_drm_query_aie_tile_metadata { __u16 row_count; @@ -331,8 +324,7 @@ struct amdxdna_drm_query_aie_tile_metadata { }; /** - * struct amdxdna_drm_query_aie_metadata - Query the metadata of the AIE - * hardware + * struct amdxdna_drm_query_aie_metadata - Query the metadata of the AIE hardware * @col_size: The size of a column in bytes. * @cols: The total number of columns. * @rows: The total number of rows. @@ -355,7 +347,7 @@ struct amdxdna_drm_query_aie_metadata { * struct amdxdna_drm_query_clock - Metadata for a clock * @name: The clock name. * @freq_mhz: The clock frequency. - * @pad: MBZ. + * @pad: Structure padding. */ struct amdxdna_drm_query_clock { __u8 name[16]; @@ -373,7 +365,9 @@ struct amdxdna_drm_query_clock_metadata { struct amdxdna_drm_query_clock h_clock; }; -enum amdxdna_sensor_type { AMDXDNA_SENSOR_TYPE_POWER }; +enum amdxdna_sensor_type { + AMDXDNA_SENSOR_TYPE_POWER +}; /** * struct amdxdna_drm_query_sensor - The data for single sensor. @@ -381,14 +375,12 @@ enum amdxdna_sensor_type { AMDXDNA_SENSOR_TYPE_POWER }; * @input: The current value of the sensor. * @max: The maximum value possible for the sensor. * @average: The average value of the sensor. - * @highest: The highest recorded sensor value for this driver load for the - * sensor. + * @highest: The highest recorded sensor value for this driver load for the sensor. * @status: The sensor status. * @units: The sensor units. - * @unitm: Translates value member variables into the correct unit via (pow(10, - * unitm) * value) - * @type: The sensor type from enum amdxdna_sensor_type - * @pad: MBZ. + * @unitm: Translates value member variables into the correct unit via (pow(10, unitm) * value). + * @type: The sensor type from enum amdxdna_sensor_type. + * @pad: Structure padding. */ struct amdxdna_drm_query_sensor { __u8 label[64]; @@ -408,14 +400,14 @@ struct amdxdna_drm_query_sensor { * @context_id: The ID for this context. * @start_col: The starting column for the partition assigned to this context. * @num_col: The number of columns in the partition assigned to this context. + * @pad: Structure padding. * @pid: The Process ID of the process that created this context. * @command_submissions: The number of commands submitted to this context. * @command_completions: The number of commands completed by this context. - * @migrations: The number of times this context has been moved to a different - * partition. - * @preemptions: The number of times this context has been preempted by another - * context in the same partition. - * @pad: MBZ. + * @migrations: The number of times this context has been moved to a different partition. + * @preemptions: The number of times this context has been preempted by another context in the + * same partition. + * @errors: The errors for this context. */ struct amdxdna_drm_query_hwctx { __u32 context_id; @@ -471,6 +463,7 @@ enum amdxdna_power_mode_type { POWER_MODE_LOW, /**< Set frequency to lowest DPM */ POWER_MODE_MEDIUM, /**< Set frequency to medium DPM */ POWER_MODE_HIGH, /**< Set frequency to highest DPM */ + POWER_MODE_TURBO, /**< More power, more performance */ }; /** @@ -508,20 +501,20 @@ enum amdxdna_drm_get_param { DRM_AMDXDNA_READ_AIE_REG, DRM_AMDXDNA_QUERY_FIRMWARE_VERSION, DRM_AMDXDNA_GET_POWER_MODE, + DRM_AMDXDNA_QUERY_TELEMETRY, DRM_AMDXDNA_NUM_GET_PARAM, }; /** * struct amdxdna_drm_get_info - Get some information from the AIE hardware. - * @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed - * in the buffer. 
+ * @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed in the buffer. * @buffer_size: Size of the input buffer. Size needed/written by the kernel. * @buffer: A structure specified by the param struct member. */ struct amdxdna_drm_get_info { - __u32 param; /* in */ + __u32 param; /* in */ __u32 buffer_size; /* in/out */ - __u64 buffer; /* in/out */ + __u64 buffer; /* in/out */ }; /** @@ -542,76 +535,56 @@ enum amdxdna_drm_set_param { }; /** - * struct amdxdna_drm_set_state - Set the state of some component within the AIE - * hardware. - * @param: Value in enum amdxdna_drm_set_param. Specifies the structure passed - * in the buffer. + * struct amdxdna_drm_set_state - Set the state of some component within the AIE hardware. + * @param: Value in enum amdxdna_drm_set_param. Specifies the structure passed in the buffer. * @buffer_size: Size of the input buffer. * @buffer: A structure specified by the param struct member. */ struct amdxdna_drm_set_state { - __u32 param; /* in */ + __u32 param; /* in */ __u32 buffer_size; /* in */ - __u64 buffer; /* in */ + __u64 buffer; /* in */ }; -/** - * struct amdxdna_drm_syncobjs - Signal or wait on array of DRM timelined sync - * objects. - * @handles: Array of handles of sync objects. - * @points: Array of time points for each sync objects. - * @count: Number of elements in the above array. - */ -struct amdxdna_drm_syncobjs { - __u64 handles; /* in */ - __u64 points; /* in */ - __u32 count; /* in */ - __u32 pad; -}; - -#define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \ - struct amdxdna_drm_create_hwctx) - -#define DRM_IOCTL_AMDXDNA_DESTROY_HWCTX \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX, \ - struct amdxdna_drm_destroy_hwctx) - -#define DRM_IOCTL_AMDXDNA_CONFIG_HWCTX \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX, \ - struct amdxdna_drm_config_hwctx) +#define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \ + struct amdxdna_drm_create_hwctx) -#define DRM_IOCTL_AMDXDNA_CREATE_BO \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO, \ - struct amdxdna_drm_create_bo) +#define DRM_IOCTL_AMDXDNA_DESTROY_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX, \ + struct amdxdna_drm_destroy_hwctx) -#define DRM_IOCTL_AMDXDNA_GET_BO_INFO \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO, \ - struct amdxdna_drm_get_bo_info) +#define DRM_IOCTL_AMDXDNA_CONFIG_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX, \ + struct amdxdna_drm_config_hwctx) -#define DRM_IOCTL_AMDXDNA_SYNC_BO \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO, struct amdxdna_drm_sync_bo) +#define DRM_IOCTL_AMDXDNA_CREATE_BO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO, \ + struct amdxdna_drm_create_bo) -#define DRM_IOCTL_AMDXDNA_EXEC_CMD \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD, struct amdxdna_drm_exec_cmd) +#define DRM_IOCTL_AMDXDNA_GET_BO_INFO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO, \ + struct amdxdna_drm_get_bo_info) -#define DRM_IOCTL_AMDXDNA_WAIT_CMD \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, struct amdxdna_drm_wait_cmd) +#define DRM_IOCTL_AMDXDNA_SYNC_BO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO, \ + struct amdxdna_drm_sync_bo) -#define DRM_IOCTL_AMDXDNA_GET_INFO \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_INFO, struct amdxdna_drm_get_info) +#define DRM_IOCTL_AMDXDNA_EXEC_CMD \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD, \ + struct amdxdna_drm_exec_cmd) 
-#define DRM_IOCTL_AMDXDNA_SET_STATE \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, \ - struct amdxdna_drm_set_state) +#define DRM_IOCTL_AMDXDNA_WAIT_CMD \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, \ + struct amdxdna_drm_wait_cmd) -#define DRM_IOCTL_AMDXDNA_SUBMIT_WAIT \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_WAIT, \ - struct amdxdna_drm_syncobjs) +#define DRM_IOCTL_AMDXDNA_GET_INFO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_INFO, \ + struct amdxdna_drm_get_info) -#define DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_SIGNAL, \ - struct amdxdna_drm_syncobjs) +#define DRM_IOCTL_AMDXDNA_SET_STATE \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, \ + struct amdxdna_drm_set_state) #if defined(__cplusplus) } /* extern c end */ diff --git a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h index 224b85d7c..9a090e2db 100644 --- a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h +++ b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h @@ -165,7 +165,7 @@ class AieAqlQueue : public core::Queue, static hsa_status_t SubmitCmd( uint32_t hw_ctx_handle, int fd, void *queue_base, uint64_t read_dispatch_id, uint64_t write_dispatch_id, - std::unordered_map &vmem_handle_mappings); + std::unordered_map &vmem_addr_mappings); /// @brief Creates a command BO and returns a pointer to the memory and // the corresponding handle @@ -185,7 +185,7 @@ class AieAqlQueue : public core::Queue, static void RegisterCmdBOs( uint32_t count, std::vector &bo_args, hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload, - std::unordered_map &vmem_handle_mappings); + std::unordered_map &vmem_addr_mappings); /// @brief Syncs all BOs referenced in bo_args /// diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index 79cbaa710..ab089a95a 100644 --- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -71,6 +71,7 @@ class XdnaDriver : public core::Driver { hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override; hsa_status_t GetHandleMappings(std::unordered_map &vmem_handle_mappings); + hsa_status_t GetAddrMappings(std::unordered_map &vmem_addr_mappings); hsa_status_t GetFd(int &fd); hsa_status_t GetAgentProperties(core::Agent &agent) const override; @@ -108,16 +109,14 @@ class XdnaDriver : public core::Driver { /// @param config_cu_param CU configuration information. hsa_status_t ConfigHwCtxCU(core::Queue &queue, - hsa_amd_aie_ert_hw_ctx_config_cu_param_t &config_cu_param); + hsa_amd_aie_ert_hw_ctx_config_cu_param_addr_t &config_cu_param); - /// TODO: Probably remove this in the future and rely on the core Runtime + /// TODO: Probably remove these in the future and rely on the core Runtime /// object to track handle allocations. Using the VMEM API for mapping XDNA /// driver handles requires a bit more refactoring. So rely on the XDNA driver /// to manage some of this for now. std::unordered_map vmem_handle_mappings; - - // TODO: Remove this once we move to the vmem API - std::unordered_map vmem_handle_mappings_reverse; + std::unordered_map vmem_addr_mappings; /// @brief Virtual address range allocated for the device heap. /// @@ -128,7 +127,7 @@ class XdnaDriver : public core::Driver { /// @brief The aligned device heap. 
void *dev_heap_aligned = nullptr; - static constexpr size_t dev_heap_size = 48 * 1024 * 1024; + static constexpr size_t dev_heap_size = 64 * 1024 * 1024; static constexpr size_t dev_heap_align = 64 * 1024 * 1024; }; diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index 283b5af60..022f794b0 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -86,6 +86,9 @@ constexpr int DEFAULT_TIMEOUT_VAL = 50; char *timeout_env_var_ptr = getenv(TIMEOUT_ENV_VAR); int timeout_val = timeout_env_var_ptr == nullptr ? DEFAULT_TIMEOUT_VAL : atoi(timeout_env_var_ptr); +// Macro to concatenate two 32-bits addresses into a 64-bit address +#define CONCAT_ADDR(HIGH, LOW) ((static_cast(HIGH) << 32) | static_cast(LOW)) + namespace rocr { namespace AMD { @@ -223,11 +226,11 @@ uint64_t AieAqlQueue::AddWriteIndexAcqRel(uint64_t value) { } void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) { - std::unordered_map vmem_handle_mappings; + std::unordered_map vmem_addr_mappings; auto &driver = static_cast( core::Runtime::runtime_singleton_->AgentDriver(agent_.driver_type)); - if (driver.GetHandleMappings(vmem_handle_mappings) != HSA_STATUS_SUCCESS) { + if (driver.GetAddrMappings(vmem_addr_mappings) != HSA_STATUS_SUCCESS) { return; } @@ -238,13 +241,15 @@ void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) { SubmitCmd(hw_ctx_handle_, fd, amd_queue_.hsa_queue.base_address, amd_queue_.read_dispatch_id, amd_queue_.write_dispatch_id, - vmem_handle_mappings); + vmem_addr_mappings); } hsa_status_t AieAqlQueue::SyncBos(std::vector &bo_args, int fd) { for (unsigned int bo_arg : bo_args) { amdxdna_drm_sync_bo sync_params = {}; sync_params.handle = bo_arg; + sync_params.offset = 0; + sync_params.size = 4 * 1024; // TODO: Actually figure this out if (ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params)) return HSA_STATUS_ERROR; } @@ -273,7 +278,7 @@ hsa_status_t AieAqlQueue::ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd, void AieAqlQueue::RegisterCmdBOs( uint32_t count, std::vector &bo_args, hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload, - std::unordered_map &vmem_handle_mappings) { + std::unordered_map &vmem_addr_mappings) { // This is the index where the operand addresses start in a command const int operand_starting_index = 5; @@ -281,33 +286,31 @@ void AieAqlQueue::RegisterCmdBOs( // Operands are 64-bits so we need to divide by two uint32_t num_operands = (count - NON_OPERAND_COUNT) / 2; + uint64_t instr_addr = CONCAT_ADDR(cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX + 1], cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX]); + auto instr_handle = vmem_addr_mappings.find(reinterpret_cast(instr_addr)); + + if (instr_handle == vmem_addr_mappings.end()) + return; + // Keep track of the handles before we submit the packet - bo_args.push_back( - cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX]); + bo_args.push_back(instr_handle->second); // Going through all of the operands in the command, keeping track of the // handles and turning the handles into addresses. 
The starting index of // the operands in a command is `operand_starting_index` and the fields // are 32-bits we need to iterate over every two for (int operand_iter = 0; operand_iter < num_operands; operand_iter++) { - bo_args.push_back( - cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter]); - // clang-format off - cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter + 1] = - (uint64_t)vmem_handle_mappings[cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter]] >> 32 & 0xFFFFFFFF; - cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter] = - (uint64_t)vmem_handle_mappings[cmd_pkt_payload->data[operand_starting_index + 2 * operand_iter]] & 0xFFFFFFFF; - // clang-format on + + uint32_t operand_index = operand_starting_index + 2 * operand_iter; + uint64_t operand_addr = CONCAT_ADDR(cmd_pkt_payload->data[operand_index + 1], cmd_pkt_payload->data[operand_index]); + auto operand_handle = vmem_addr_mappings.find(reinterpret_cast(operand_addr)); + if (operand_handle == vmem_addr_mappings.end()) + return; + bo_args.push_back(operand_handle->second); } // Transform the instruction sequence address into device address - cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX] = - DEV_ADDR_BASE | - (reinterpret_cast( - vmem_handle_mappings - [cmd_pkt_payload - ->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX]]) & - DEV_ADDR_OFFSET_MASK); + cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX] = DEV_ADDR_BASE | instr_addr & DEV_ADDR_OFFSET_MASK; } hsa_status_t AieAqlQueue::CreateCmd(uint32_t size, uint32_t *handle, @@ -335,7 +338,7 @@ hsa_status_t AieAqlQueue::CreateCmd(uint32_t size, uint32_t *handle, hsa_status_t AieAqlQueue::SubmitCmd( uint32_t hw_ctx_handle, int fd, void *queue_base, uint64_t read_dispatch_id, uint64_t write_dispatch_id, - std::unordered_map &vmem_handle_mappings) { + std::unordered_map &vmem_addr_mappings) { uint64_t cur_id = read_dispatch_id; while (cur_id < write_dispatch_id) { hsa_amd_aie_ert_packet_t *pkt = @@ -376,7 +379,7 @@ hsa_status_t AieAqlQueue::SubmitCmd( // Add the handles for all of the BOs to bo_args as well as rewrite // the command payload handles to contain the actual virtual addresses - RegisterCmdBOs(pkt->count, bo_args, cmd_pkt_payload, vmem_handle_mappings); + RegisterCmdBOs(pkt->count, bo_args, cmd_pkt_payload, vmem_addr_mappings); // Creating a packet that contains the command to execute the kernel uint32_t cmd_bo_handle = 0; diff --git a/runtime/hsa-runtime/inc/hsa_ext_amd.h b/runtime/hsa-runtime/inc/hsa_ext_amd.h index be7e61b41..43600075a 100644 --- a/runtime/hsa-runtime/inc/hsa_ext_amd.h +++ b/runtime/hsa-runtime/inc/hsa_ext_amd.h @@ -1321,6 +1321,7 @@ hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent, /** * @brief Hardware context configuration for one AIE CU. + * specifying handle of CU configuration. */ typedef struct hsa_amd_aie_ert_hw_ctx_cu_config_s { /** @@ -1334,6 +1335,22 @@ typedef struct hsa_amd_aie_ert_hw_ctx_cu_config_s { uint8_t reserved[3]; } hsa_amd_aie_ert_hw_ctx_cu_config_t; +/** + * @brief Hardware context configuration for one AIE CU. + * specifying address of CU configuration. + */ +typedef struct hsa_amd_aie_ert_hw_ctx_cu_config_addr_s { + /** + * @brief CU configuration BO handle. + */ + uint64_t cu_config_addr; + /** + * @brief Function of a CU. + */ + uint8_t cu_func; + uint8_t reserved[3]; +} hsa_amd_aie_ert_hw_ctx_cu_config_addr_t; + typedef struct hsa_amd_aie_ert_hw_ctx_config_cu_param_s { /** * @brief Number of CUs to configure. 
@@ -1346,6 +1363,18 @@ typedef struct hsa_amd_aie_ert_hw_ctx_config_cu_param_s { hsa_amd_aie_ert_hw_ctx_cu_config_t *cu_configs; } hsa_amd_aie_ert_hw_ctx_config_cu_param_t; +typedef struct hsa_amd_aie_ert_hw_ctx_config_cu_param_addr_s { + /** + * @brief Number of CUs to configure. + */ + uint16_t num_cus; + uint16_t reserved[3]; + /** + * @brief List of CU configurations. + */ + hsa_amd_aie_ert_hw_ctx_cu_config_addr_t *cu_configs; +} hsa_amd_aie_ert_hw_ctx_config_cu_param_addr_t; + /** * brief Specify a hardware context configuration parameter type for a queue. */ From 82cf234b7ac4ecf0227fa272781be1528051a535 Mon Sep 17 00:00:00 2001 From: Eddie Richter Date: Wed, 30 Oct 2024 16:16:32 -0600 Subject: [PATCH 2/7] Removing sideband to get handle from virtual address --- rocrtst/suites/aie/aie_hsa_dispatch_test.cc | 17 ----------------- .../core/common/hsa_table_interface.cpp | 5 ----- .../core/driver/kfd/amd_kfd_driver.cpp | 5 ----- .../core/driver/xdna/amd_xdna_driver.cpp | 8 -------- runtime/hsa-runtime/core/inc/amd_kfd_driver.h | 1 - runtime/hsa-runtime/core/inc/amd_xdna_driver.h | 1 - runtime/hsa-runtime/core/inc/driver.h | 2 -- runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h | 3 --- runtime/hsa-runtime/core/inc/runtime.h | 2 -- .../hsa-runtime/core/runtime/hsa_api_trace.cpp | 3 +-- .../hsa-runtime/core/runtime/hsa_ext_amd.cpp | 13 ------------- runtime/hsa-runtime/core/runtime/runtime.cpp | 11 ----------- runtime/hsa-runtime/inc/hsa_api_trace.h | 1 - runtime/hsa-runtime/inc/hsa_ext_amd.h | 3 --- 14 files changed, 1 insertion(+), 74 deletions(-) diff --git a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc index ce3c7135e..64c895be4 100644 --- a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc +++ b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc @@ -204,17 +204,8 @@ int main(int argc, char **argv) { uint32_t num_instr; load_instr_file(global_dev_mem_pool, instr_inst_file_name, reinterpret_cast(&instr_inst_buf), num_instr); - uint32_t instr_handle = 0; - r = hsa_amd_get_handle_from_vaddr(instr_inst_buf, &instr_handle); - assert(r == HSA_STATUS_SUCCESS); - assert(instr_handle != 0); - load_pdi_file(global_dev_mem_pool, pdi_file_name, reinterpret_cast(&pdi_buf)); - uint32_t pdi_handle = 0; - r = hsa_amd_get_handle_from_vaddr(pdi_buf, &pdi_handle); - assert(r == HSA_STATUS_SUCCESS); - assert(pdi_handle != 0); hsa_amd_aie_ert_hw_ctx_cu_config_addr_t cu_config{.cu_config_addr = reinterpret_cast(pdi_buf), .cu_func = 0}; @@ -235,8 +226,6 @@ int main(int argc, char **argv) { std::vector input(num_pkts); std::vector output(num_pkts); std::vector cmd_payloads(num_pkts); - std::vector input_handle(num_pkts); - std::vector output_handle(num_pkts); uint64_t wr_idx = 0; uint64_t packet_id = 0; @@ -245,16 +234,10 @@ int main(int argc, char **argv) { r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0, reinterpret_cast(&input[pkt_iter])); assert(r == HSA_STATUS_SUCCESS); - r = hsa_amd_get_handle_from_vaddr(input[pkt_iter], &input_handle[pkt_iter]); - assert(r == HSA_STATUS_SUCCESS); - assert(input_handle[pkt_iter] != 0); r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0, reinterpret_cast(&output[pkt_iter])); assert(r == HSA_STATUS_SUCCESS); - r = hsa_amd_get_handle_from_vaddr(output[pkt_iter], &output_handle[pkt_iter]); - assert(r == HSA_STATUS_SUCCESS); - assert(output_handle[pkt_iter] != 0); for (std::size_t i = 0; i < num_data_elements; i++) { *(input[pkt_iter] + i) = i * (pkt_iter + 1); diff --git 
a/runtime/hsa-runtime/core/common/hsa_table_interface.cpp b/runtime/hsa-runtime/core/common/hsa_table_interface.cpp index 69b938e66..fa41e7ba6 100644 --- a/runtime/hsa-runtime/core/common/hsa_table_interface.cpp +++ b/runtime/hsa-runtime/core/common/hsa_table_interface.cpp @@ -930,11 +930,6 @@ hsa_amd_queue_hw_ctx_config(const hsa_queue_t *queue, return amdExtTable->hsa_amd_queue_hw_ctx_config_fn(queue, config_type, args); } -// Mirrors AMD Extension APIs. -hsa_status_t hsa_amd_get_handle_from_vaddr(void* ptr, uint32_t* handle) { - return amdExtTable->hsa_amd_get_handle_from_vaddr_fn(ptr, handle); -} - // Mirrors Amd Extension Apis hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, diff --git a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp index f405bf502..27b1aebf1 100644 --- a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp +++ b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp @@ -257,11 +257,6 @@ KfdDriver::ConfigHwCtx(core::Queue &queue, return HSA_STATUS_ERROR_INVALID_AGENT; } -hsa_status_t KfdDriver::GetHandleFromVaddr(void* ptr, uint32_t* handle) { - // Only AIE queues support this for now. - return HSA_STATUS_ERROR_INVALID_AGENT; -} - void *KfdDriver::AllocateKfdMemory(const HsaMemFlags &flags, uint32_t node_id, size_t size) { void *mem = nullptr; diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index 0437e2dec..fd65f8c51 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -278,14 +278,6 @@ XdnaDriver::ConfigHwCtx(core::Queue &queue, } } -hsa_status_t XdnaDriver::GetHandleFromVaddr(void* ptr, uint32_t* handle) { - auto it = vmem_addr_mappings.find(ptr); - if (it == vmem_addr_mappings.end()) - return HSA_STATUS_ERROR_INVALID_ALLOCATION; - *handle = it->second; - return HSA_STATUS_SUCCESS; -} - hsa_status_t XdnaDriver::QueryDriverVersion() { amdxdna_drm_query_aie_version aie_version{0, 0}; amdxdna_drm_get_info args{DRM_AMDXDNA_QUERY_AIE_VERSION, sizeof(aie_version), diff --git a/runtime/hsa-runtime/core/inc/amd_kfd_driver.h b/runtime/hsa-runtime/core/inc/amd_kfd_driver.h index bd6f376fb..190aabae5 100644 --- a/runtime/hsa-runtime/core/inc/amd_kfd_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_kfd_driver.h @@ -82,7 +82,6 @@ class KfdDriver : public core::Driver { hsa_status_t ConfigHwCtx(core::Queue &queue, hsa_amd_queue_hw_ctx_config_param_t config_type, void *args) override; - hsa_status_t GetHandleFromVaddr(void* ptr, uint32_t* handle) override; private: /// @brief Allocate agent accessible memory (system / local memory). 
diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index ab089a95a..158091b56 100644 --- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -94,7 +94,6 @@ class XdnaDriver : public core::Driver { hsa_amd_queue_hw_ctx_config_param_t config_type, void *args) override; - hsa_status_t GetHandleFromVaddr(void* ptr, uint32_t* handle) override; private: hsa_status_t QueryDriverVersion(); diff --git a/runtime/hsa-runtime/core/inc/driver.h b/runtime/hsa-runtime/core/inc/driver.h index 580a12ccb..094eb913d 100644 --- a/runtime/hsa-runtime/core/inc/driver.h +++ b/runtime/hsa-runtime/core/inc/driver.h @@ -138,8 +138,6 @@ class Driver { ConfigHwCtx(Queue &queue, hsa_amd_queue_hw_ctx_config_param_t config_type, void *args) = 0; - virtual hsa_status_t GetHandleFromVaddr(void* ptr, uint32_t* handle) = 0; - /// Unique identifier for supported kernel-mode drivers. const DriverType kernel_driver_type_; diff --git a/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h b/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h index d10300e89..e34a5bab1 100644 --- a/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h +++ b/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h @@ -114,9 +114,6 @@ hsa_amd_queue_hw_ctx_config(const hsa_queue_t *queue, hsa_amd_queue_hw_ctx_config_param_t config_type, void *args); -// Mirrors AMD Extension APIs. -hsa_status_t hsa_amd_get_handle_from_vaddr(void* ptr, uint32_t* handle); - // Mirrors Amd Extension Apis hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, diff --git a/runtime/hsa-runtime/core/inc/runtime.h b/runtime/hsa-runtime/core/inc/runtime.h index 3e3f48294..981bd4852 100644 --- a/runtime/hsa-runtime/core/inc/runtime.h +++ b/runtime/hsa-runtime/core/inc/runtime.h @@ -403,8 +403,6 @@ class Runtime { const core::MemoryRegion** mem_region, hsa_amd_memory_type_t* type); - hsa_status_t GetHandleFromVaddr(void* ptr, uint32_t* handle); - hsa_status_t EnableLogging(uint8_t* flags, void* file); const std::vector& cpu_agents() { return cpu_agents_; } diff --git a/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp b/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp index b4985c3be..cee7ebac2 100644 --- a/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp +++ b/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp @@ -80,7 +80,7 @@ void HsaApiTable::Init() { // they can add preprocessor macros on the new functions constexpr size_t expected_core_api_table_size = 1016; - constexpr size_t expected_amd_ext_table_size = 600;//592; + constexpr size_t expected_amd_ext_table_size = 592; constexpr size_t expected_image_ext_table_size = 120; constexpr size_t expected_finalizer_ext_table_size = 64; constexpr size_t expected_tools_table_size = 64; @@ -407,7 +407,6 @@ void HsaApiTable::UpdateAmdExts() { amd_ext_api.hsa_amd_async_function_fn = AMD::hsa_amd_async_function; amd_ext_api.hsa_amd_signal_wait_any_fn = AMD::hsa_amd_signal_wait_any; amd_ext_api.hsa_amd_queue_hw_ctx_config_fn = AMD::hsa_amd_queue_hw_ctx_config; - amd_ext_api.hsa_amd_get_handle_from_vaddr_fn = AMD::hsa_amd_get_handle_from_vaddr; amd_ext_api.hsa_amd_queue_cu_set_mask_fn = AMD::hsa_amd_queue_cu_set_mask; amd_ext_api.hsa_amd_queue_cu_get_mask_fn = AMD::hsa_amd_queue_cu_get_mask; amd_ext_api.hsa_amd_memory_pool_get_info_fn = AMD::hsa_amd_memory_pool_get_info; diff --git a/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp index 0184e175e..b9207d8bb 
100644 --- a/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp +++ b/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp @@ -634,19 +634,6 @@ hsa_amd_queue_hw_ctx_config(const hsa_queue_t *queue, CATCH; } - -hsa_status_t hsa_amd_get_handle_from_vaddr(void* ptr, uint32_t* handle) { - TRY; - IS_OPEN(); - - IS_BAD_PTR(ptr); - IS_BAD_PTR(handle); - - return core::Runtime::runtime_singleton_->GetHandleFromVaddr(ptr, handle); - - CATCH; -} - hsa_status_t hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, const uint32_t* cu_mask) { TRY; diff --git a/runtime/hsa-runtime/core/runtime/runtime.cpp b/runtime/hsa-runtime/core/runtime/runtime.cpp index a266df260..ab53fbe98 100644 --- a/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -3605,17 +3605,6 @@ hsa_status_t Runtime::VMemoryGetAllocPropertiesFromHandle(hsa_amd_vmem_alloc_han return HSA_STATUS_SUCCESS; } -hsa_status_t Runtime::GetHandleFromVaddr(void* ptr, uint32_t* handle) { - auto it = allocation_map_.find(ptr); - if (it == allocation_map_.end()) { - return HSA_STATUS_ERROR_INVALID_ALLOCATION; - } - - auto* agent = it->second.region->owner(); - auto& driver = AgentDriver(agent->driver_type); - return driver.GetHandleFromVaddr(ptr, handle); -} - hsa_status_t Runtime::EnableLogging(uint8_t* flags, void* file) { memcpy(log_flags, flags, sizeof(log_flags)); diff --git a/runtime/hsa-runtime/inc/hsa_api_trace.h b/runtime/hsa-runtime/inc/hsa_api_trace.h index a5d85e85c..3cadef74e 100644 --- a/runtime/hsa-runtime/inc/hsa_api_trace.h +++ b/runtime/hsa-runtime/inc/hsa_api_trace.h @@ -205,7 +205,6 @@ struct AmdExtTable { decltype(hsa_amd_async_function)* hsa_amd_async_function_fn; decltype(hsa_amd_signal_wait_any)* hsa_amd_signal_wait_any_fn; decltype(hsa_amd_queue_hw_ctx_config) *hsa_amd_queue_hw_ctx_config_fn; - decltype(hsa_amd_get_handle_from_vaddr)* hsa_amd_get_handle_from_vaddr_fn; decltype(hsa_amd_queue_cu_set_mask)* hsa_amd_queue_cu_set_mask_fn; decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info_fn; decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools_fn; diff --git a/runtime/hsa-runtime/inc/hsa_ext_amd.h b/runtime/hsa-runtime/inc/hsa_ext_amd.h index 43600075a..56c54b486 100644 --- a/runtime/hsa-runtime/inc/hsa_ext_amd.h +++ b/runtime/hsa-runtime/inc/hsa_ext_amd.h @@ -1404,9 +1404,6 @@ hsa_status_t HSA_API hsa_amd_queue_hw_ctx_config( const hsa_queue_t *queue, hsa_amd_queue_hw_ctx_config_param_t config_type, void *args); - -hsa_status_t HSA_API hsa_amd_get_handle_from_vaddr(void* ptr, uint32_t* handle); - /** * @brief Set a queue's CU affinity mask. 
* From 4228ba89bf005d7cac282c49f75578e02d528269 Mon Sep 17 00:00:00 2001 From: Eddie Richter Date: Wed, 30 Oct 2024 16:49:04 -0600 Subject: [PATCH 3/7] Passing the size of each operand in the packet so we can sync each buffer properly --- rocrtst/suites/aie/aie_hsa_dispatch_test.cc | 2 ++ .../hsa-runtime/core/inc/amd_aie_aql_queue.h | 4 +-- .../core/runtime/amd_aie_aql_queue.cpp | 36 ++++++++++++++----- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc index 64c895be4..33117905a 100644 --- a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc +++ b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc @@ -277,6 +277,8 @@ int main(int argc, char **argv) { cmd_payload->data[6] = HIGH_ADDR(input[pkt_iter]); cmd_payload->data[7] = LOW_ADDR(output[pkt_iter]); cmd_payload->data[8] = HIGH_ADDR(output[pkt_iter]); + cmd_payload->data[9] = num_data_elements * sizeof(uint32_t); + cmd_payload->data[10] = num_data_elements * sizeof(uint32_t); cmd_pkt->payload_data = reinterpret_cast(cmd_payload); // Keeping track of payloads so we can free them at the end diff --git a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h index 9a090e2db..35bfb3573 100644 --- a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h +++ b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h @@ -183,14 +183,14 @@ class AieAqlQueue : public core::Queue, /// @param bo_args A pointer to a vector that contains all bo handles /// @param cmd_pkt_payload A pointer to the payload of the command static void RegisterCmdBOs( - uint32_t count, std::vector &bo_args, + uint32_t count, std::vector &bo_args, std::vector &bo_sizes, hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload, std::unordered_map &vmem_addr_mappings); /// @brief Syncs all BOs referenced in bo_args /// /// @param bo_args vector containing handles of BOs to sync - static hsa_status_t SyncBos(std::vector &bo_args, int fd); + static hsa_status_t SyncBos(std::vector &bo_args,std::vector &bo_sizes, int fd); /// @brief Executes a command and waits for its completion /// diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index 022f794b0..c5739b90b 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -76,9 +76,13 @@ constexpr int DEV_ADDR_OFFSET_MASK = 0x02FFFFFF; // https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_message.c#L637 constexpr int CMD_COUNT_SIZE_INCREASE = 3; +// The size of an instruction in bytes +constexpr int INSTR_SIZE_BYTES = 4; + // Index of command payload where the instruction sequence // address is located constexpr int CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX = 2; +constexpr int CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_SIZE_IDX = 4; // Environment variable to define job submission timeout constexpr const char *TIMEOUT_ENV_VAR = "ROCR_AIE_TIMEOUT"; @@ -244,12 +248,16 @@ void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) { vmem_addr_mappings); } -hsa_status_t AieAqlQueue::SyncBos(std::vector &bo_args, int fd) { - for (unsigned int bo_arg : bo_args) { +hsa_status_t AieAqlQueue::SyncBos(std::vector &bo_args, std::vector &bo_sizes, int fd) { + + if (bo_args.size() != bo_sizes.size()) + return HSA_STATUS_ERROR; + + for (int i = 0; i < bo_args.size(); i++) { amdxdna_drm_sync_bo sync_params = {}; - sync_params.handle = bo_arg; + sync_params.handle = bo_args[i]; 
sync_params.offset = 0; - sync_params.size = 4 * 1024; // TODO: Actually figure this out + sync_params.size = bo_sizes[i]; if (ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params)) return HSA_STATUS_ERROR; } @@ -277,7 +285,7 @@ hsa_status_t AieAqlQueue::ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd, void AieAqlQueue::RegisterCmdBOs( uint32_t count, std::vector &bo_args, - hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload, + std::vector &bo_sizes, hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload, std::unordered_map &vmem_addr_mappings) { // This is the index where the operand addresses start in a command const int operand_starting_index = 5; @@ -295,6 +303,10 @@ void AieAqlQueue::RegisterCmdBOs( // Keep track of the handles before we submit the packet bo_args.push_back(instr_handle->second); + // Adding the instruction sequence size. The packet contains the number of instructions. + uint32_t instr_bo_size = cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_SIZE_IDX] * INSTR_SIZE_BYTES; + bo_sizes.push_back(instr_bo_size); + // Going through all of the operands in the command, keeping track of the // handles and turning the handles into addresses. The starting index of // the operands in a command is `operand_starting_index` and the fields @@ -309,6 +321,13 @@ void AieAqlQueue::RegisterCmdBOs( bo_args.push_back(operand_handle->second); } + // Going through all of the operands in the command, keeping track of + // the sizes of each operand. The size is used to sync the buffer + uint32_t operand_size_starting_index = operand_starting_index + 2 * num_operands; + for (int operand_iter = 0; operand_iter < num_operands; operand_iter++) { + bo_sizes.push_back(cmd_pkt_payload->data[operand_size_starting_index + operand_iter]); + } + // Transform the instruction sequence address into device address cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_IDX] = DEV_ADDR_BASE | instr_addr & DEV_ADDR_OFFSET_MASK; } @@ -353,6 +372,7 @@ hsa_status_t AieAqlQueue::SubmitCmd( switch (pkt->opcode) { case HSA_AMD_AIE_ERT_START_CU: { std::vector bo_args; + std::vector bo_sizes; std::vector cmd_handles; std::vector cmd_sizes; std::vector cmds; @@ -379,7 +399,7 @@ hsa_status_t AieAqlQueue::SubmitCmd( // Add the handles for all of the BOs to bo_args as well as rewrite // the command payload handles to contain the actual virtual addresses - RegisterCmdBOs(pkt->count, bo_args, cmd_pkt_payload, vmem_addr_mappings); + RegisterCmdBOs(pkt->count, bo_args, bo_sizes, cmd_pkt_payload, vmem_addr_mappings); // Creating a packet that contains the command to execute the kernel uint32_t cmd_bo_handle = 0; @@ -428,7 +448,7 @@ hsa_status_t AieAqlQueue::SubmitCmd( } // Syncing BOs before we execute the command - if (SyncBos(bo_args, fd)) + if (SyncBos(bo_args, bo_sizes, fd)) return HSA_STATUS_ERROR; // Removing duplicates in the bo container. 
The driver will report @@ -465,7 +485,7 @@ hsa_status_t AieAqlQueue::SubmitCmd( ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close_bo_args); // Syncing BOs after we execute the command - if (SyncBos(bo_args, fd)) + if (SyncBos(bo_args, bo_sizes, fd)) return HSA_STATUS_ERROR; cur_id += num_cont_start_cu_pkts; From a48c20bc8f8ec403d7478192696dea694b9d2400 Mon Sep 17 00:00:00 2001 From: Eddie Richter Date: Wed, 30 Oct 2024 20:15:21 -0600 Subject: [PATCH 4/7] Syncing BOs in userspace instead of making a syscall --- rocrtst/suites/aie/aie_hsa_dispatch_test.cc | 14 ++++++---- .../core/driver/xdna/amd_xdna_driver.cpp | 8 +----- .../hsa-runtime/core/inc/amd_aie_aql_queue.h | 4 +-- .../hsa-runtime/core/inc/amd_xdna_driver.h | 19 +++++++++++++ .../core/runtime/amd_aie_aql_queue.cpp | 28 +++++++++---------- runtime/hsa-runtime/inc/hsa_ext_amd.h | 5 ++++ 6 files changed, 49 insertions(+), 29 deletions(-) diff --git a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc index 33117905a..ca56a97e0 100644 --- a/rocrtst/suites/aie/aie_hsa_dispatch_test.cc +++ b/rocrtst/suites/aie/aie_hsa_dispatch_test.cc @@ -100,7 +100,7 @@ hsa_status_t get_coarse_global_kernarg_mem_pool(hsa_amd_memory_pool_t pool, } void load_pdi_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name, - void **buf) { + void **buf, uint32_t &pdi_size) { std::ifstream bin_file(file_name, std::ios::binary | std::ios::ate | std::ios::in); @@ -112,6 +112,7 @@ void load_pdi_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name, auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf); assert(r == HSA_STATUS_SUCCESS); bin_file.read(reinterpret_cast(*buf), size); + pdi_size = size; } void load_instr_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name, @@ -202,15 +203,18 @@ int main(int argc, char **argv) { // Load the DPU and PDI files into a global pool that doesn't support kernel // args (DEV BO). uint32_t num_instr; + uint32_t pdi_size; load_instr_file(global_dev_mem_pool, instr_inst_file_name, reinterpret_cast(&instr_inst_buf), num_instr); load_pdi_file(global_dev_mem_pool, pdi_file_name, - reinterpret_cast(&pdi_buf)); + reinterpret_cast(&pdi_buf), pdi_size); - hsa_amd_aie_ert_hw_ctx_cu_config_addr_t cu_config{.cu_config_addr = reinterpret_cast(pdi_buf), - .cu_func = 0}; + hsa_amd_aie_ert_hw_ctx_cu_config_addr_t cu_config { + .cu_config_addr = reinterpret_cast(pdi_buf), + .cu_func = 0, + .cu_size = pdi_size}; - hsa_amd_aie_ert_hw_ctx_config_cu_param_addr_t config_cu_args{ + hsa_amd_aie_ert_hw_ctx_config_cu_param_addr_t config_cu_args { .num_cus = 1, .cu_configs = &cu_config}; // Configure the queue's hardware context. 
diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index fd65f8c51..a6ef6b879 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -409,13 +409,7 @@ hsa_status_t XdnaDriver::ConfigHwCtxCU( config_cu_param.cu_configs[i].cu_func; // sync configuration buffer - amdxdna_drm_sync_bo sync_args = {}; - sync_args.handle = xdna_config_cu_param->cu_configs[i].cu_bo; - sync_args.offset = 0; - sync_args.size = 4 * 1024; // TODO: Try this with the proper size - if (ioctl(fd_, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_args) < 0) { - return HSA_STATUS_ERROR; - } + clflush_data(reinterpret_cast(config_cu_param.cu_configs[i].cu_config_addr), 0, config_cu_param.cu_configs[i].cu_size); } amdxdna_drm_config_hwctx config_hw_ctx_args{ diff --git a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h index 35bfb3573..a2450da65 100644 --- a/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h +++ b/runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h @@ -184,13 +184,13 @@ class AieAqlQueue : public core::Queue, /// @param cmd_pkt_payload A pointer to the payload of the command static void RegisterCmdBOs( uint32_t count, std::vector &bo_args, std::vector &bo_sizes, - hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload, + std::vector &bo_addrs, hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload, std::unordered_map &vmem_addr_mappings); /// @brief Syncs all BOs referenced in bo_args /// /// @param bo_args vector containing handles of BOs to sync - static hsa_status_t SyncBos(std::vector &bo_args,std::vector &bo_sizes, int fd); + static hsa_status_t SyncBos(std::vector &bo_args,std::vector &bo_sizes, int fd); /// @brief Executes a command and waits for its completion /// diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index 158091b56..6764e0835 100644 --- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ -44,11 +44,30 @@ #include #include +#include #include "core/inc/driver.h" #include "core/inc/memory_region.h" #include "core/driver/xdna/uapi/amdxdna_accel.h" +// Flushes the cache lines assocaited with a BO. This +// is used to sync a BO without going to the xdna driver. 
+inline void +clflush_data(const void *base, size_t offset, size_t len) +{ + + // Getting the cacheline size of the system + uint64_t cacheline_size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + + // Flushing each cache line in the provided address range + uint64_t base_addr = reinterpret_cast(base) + offset; + for (int i = 0; i < len / cacheline_size; i++) { + uint64_t cur_addr = base_addr + cacheline_size * i; + _mm_clflush(reinterpret_cast(cur_addr)); + } + +} + namespace rocr { namespace core { class Queue; diff --git a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp index c5739b90b..ff97a3185 100644 --- a/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp @@ -248,18 +248,13 @@ void AieAqlQueue::StoreRelaxed(hsa_signal_value_t value) { vmem_addr_mappings); } -hsa_status_t AieAqlQueue::SyncBos(std::vector &bo_args, std::vector &bo_sizes, int fd) { +hsa_status_t AieAqlQueue::SyncBos(std::vector &bo_addrs, std::vector &bo_sizes, int fd) { - if (bo_args.size() != bo_sizes.size()) + if (bo_addrs.size() != bo_sizes.size()) return HSA_STATUS_ERROR; - for (int i = 0; i < bo_args.size(); i++) { - amdxdna_drm_sync_bo sync_params = {}; - sync_params.handle = bo_args[i]; - sync_params.offset = 0; - sync_params.size = bo_sizes[i]; - if (ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params)) - return HSA_STATUS_ERROR; + for (int i = 0; i < bo_addrs.size(); i++) { + clflush_data(reinterpret_cast(bo_addrs[i]), 0, bo_sizes[i]); } return HSA_STATUS_SUCCESS; @@ -285,7 +280,7 @@ hsa_status_t AieAqlQueue::ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd, void AieAqlQueue::RegisterCmdBOs( uint32_t count, std::vector &bo_args, - std::vector &bo_sizes, hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload, + std::vector &bo_sizes, std::vector &bo_addrs, hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload, std::unordered_map &vmem_addr_mappings) { // This is the index where the operand addresses start in a command const int operand_starting_index = 5; @@ -300,15 +295,16 @@ void AieAqlQueue::RegisterCmdBOs( if (instr_handle == vmem_addr_mappings.end()) return; - // Keep track of the handles before we submit the packet + // Keep track of the handles and addresses before we submit the packet bo_args.push_back(instr_handle->second); + bo_addrs.push_back(instr_addr); // Adding the instruction sequence size. The packet contains the number of instructions. uint32_t instr_bo_size = cmd_pkt_payload->data[CMD_PKT_PAYLOAD_INSTRUCTION_SEQUENCE_SIZE_IDX] * INSTR_SIZE_BYTES; bo_sizes.push_back(instr_bo_size); // Going through all of the operands in the command, keeping track of the - // handles and turning the handles into addresses. The starting index of + // addresses and turning the addresses into handles. 
The starting index of // the operands in a command is `operand_starting_index` and the fields // are 32-bits we need to iterate over every two for (int operand_iter = 0; operand_iter < num_operands; operand_iter++) { @@ -319,6 +315,7 @@ void AieAqlQueue::RegisterCmdBOs( if (operand_handle == vmem_addr_mappings.end()) return; bo_args.push_back(operand_handle->second); + bo_addrs.push_back(operand_addr); } // Going through all of the operands in the command, keeping track of @@ -373,6 +370,7 @@ hsa_status_t AieAqlQueue::SubmitCmd( case HSA_AMD_AIE_ERT_START_CU: { std::vector bo_args; std::vector bo_sizes; + std::vector bo_addrs; std::vector cmd_handles; std::vector cmd_sizes; std::vector cmds; @@ -399,7 +397,7 @@ hsa_status_t AieAqlQueue::SubmitCmd( // Add the handles for all of the BOs to bo_args as well as rewrite // the command payload handles to contain the actual virtual addresses - RegisterCmdBOs(pkt->count, bo_args, bo_sizes, cmd_pkt_payload, vmem_addr_mappings); + RegisterCmdBOs(pkt->count, bo_args, bo_sizes, bo_addrs, cmd_pkt_payload, vmem_addr_mappings); // Creating a packet that contains the command to execute the kernel uint32_t cmd_bo_handle = 0; @@ -448,7 +446,7 @@ hsa_status_t AieAqlQueue::SubmitCmd( } // Syncing BOs before we execute the command - if (SyncBos(bo_args, bo_sizes, fd)) + if (SyncBos(bo_addrs, bo_sizes, fd)) return HSA_STATUS_ERROR; // Removing duplicates in the bo container. The driver will report @@ -485,7 +483,7 @@ hsa_status_t AieAqlQueue::SubmitCmd( ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close_bo_args); // Syncing BOs after we execute the command - if (SyncBos(bo_args, bo_sizes, fd)) + if (SyncBos(bo_addrs, bo_sizes, fd)) return HSA_STATUS_ERROR; cur_id += num_cont_start_cu_pkts; diff --git a/runtime/hsa-runtime/inc/hsa_ext_amd.h b/runtime/hsa-runtime/inc/hsa_ext_amd.h index 56c54b486..b92c6a094 100644 --- a/runtime/hsa-runtime/inc/hsa_ext_amd.h +++ b/runtime/hsa-runtime/inc/hsa_ext_amd.h @@ -1349,6 +1349,11 @@ typedef struct hsa_amd_aie_ert_hw_ctx_cu_config_addr_s { */ uint8_t cu_func; uint8_t reserved[3]; + + /** + * @brief The size of the CU configuration + */ + uint32_t cu_size; } hsa_amd_aie_ert_hw_ctx_cu_config_addr_t; typedef struct hsa_amd_aie_ert_hw_ctx_config_cu_param_s { From 4fc64f63ab05fdbd26912df23a57fa9fc029444d Mon Sep 17 00:00:00 2001 From: Eddie Richter Date: Wed, 30 Oct 2024 20:26:40 -0600 Subject: [PATCH 5/7] [SQUASH] CI seems to need this --- runtime/hsa-runtime/core/driver/xdna/uapi/amdxdna_accel.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/runtime/hsa-runtime/core/driver/xdna/uapi/amdxdna_accel.h b/runtime/hsa-runtime/core/driver/xdna/uapi/amdxdna_accel.h index 03e74d387..fa3985cc3 100644 --- a/runtime/hsa-runtime/core/driver/xdna/uapi/amdxdna_accel.h +++ b/runtime/hsa-runtime/core/driver/xdna/uapi/amdxdna_accel.h @@ -14,6 +14,10 @@ extern "C" { #endif +#ifndef __counted_by +#define __counted_by(cnt) +#endif + #define AMDXDNA_DRIVER_MAJOR 1 #define AMDXDNA_DRIVER_MINOR 0 From 8ae1c33a5c5ff451b4c0dedb2733ca2f0f301ef7 Mon Sep 17 00:00:00 2001 From: Eddie Richter Date: Thu, 31 Oct 2024 12:06:11 -0600 Subject: [PATCH 6/7] [SQUASH] Reverting cache flush modifications --- .../hsa-runtime/core/inc/amd_xdna_driver.h | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h index 6764e0835..3da87b4b9 100644 --- a/runtime/hsa-runtime/core/inc/amd_xdna_driver.h +++ b/runtime/hsa-runtime/core/inc/amd_xdna_driver.h @@ 
-52,20 +52,26 @@ // Flushes the cache lines assocaited with a BO. This // is used to sync a BO without going to the xdna driver. +// This is from the XRT KMQ shim. inline void clflush_data(const void *base, size_t offset, size_t len) { + static long cacheline_size = 0; - // Getting the cacheline size of the system - uint64_t cacheline_size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); - - // Flushing each cache line in the provided address range - uint64_t base_addr = reinterpret_cast(base) + offset; - for (int i = 0; i < len / cacheline_size; i++) { - uint64_t cur_addr = base_addr + cacheline_size * i; - _mm_clflush(reinterpret_cast(cur_addr)); + if (!cacheline_size) { + long sz = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + if (sz <= 0) + return; + cacheline_size = sz; } + const char *cur = (const char *)base; + cur += offset; + uintptr_t lastline = (uintptr_t)(cur + len - 1) | (cacheline_size - 1); + do { + _mm_clflush(cur); + cur += cacheline_size; + } while (cur <= (const char *)lastline); } namespace rocr { From 783923018add5ef89816eef0ac986ef20b64e2b4 Mon Sep 17 00:00:00 2001 From: Eddie Richter Date: Thu, 31 Oct 2024 12:22:24 -0600 Subject: [PATCH 7/7] [SQUASH] Changing order of create bo ioctl struct --- runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp index a6ef6b879..7dc465639 100644 --- a/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp +++ b/runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp @@ -294,10 +294,9 @@ hsa_status_t XdnaDriver::QueryDriverVersion() { } hsa_status_t XdnaDriver::InitDeviceHeap() { - amdxdna_drm_create_bo create_bo_args{.type = AMDXDNA_BO_DEV_HEAP, - .vaddr = - reinterpret_cast(nullptr), - .size = dev_heap_size}; + amdxdna_drm_create_bo create_bo_args{.vaddr = reinterpret_cast(nullptr), + .size = dev_heap_size, + .type = AMDXDNA_BO_DEV_HEAP}; amdxdna_drm_get_bo_info get_bo_info_args{0}; drm_gem_close close_bo_args{0};
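
For reference, a minimal sketch (not part of the patches) of how the pieces in this series fit together: the test splits a 64-bit virtual address into two 32-bit payload words, the runtime reassembles the address and resolves it to a BO handle through an address-to-handle map, and BOs are synced in userspace by flushing their cache lines instead of calling DRM_IOCTL_AMDXDNA_SYNC_BO. The helper names low_addr, high_addr, concat_addr, lookup_handle, flush_bo_range, and AddrToHandleMap are hypothetical stand-ins for the LOW_ADDR/HIGH_ADDR macros, CONCAT_ADDR, XdnaDriver::vmem_addr_mappings, and clflush_data introduced above; the real code paths differ in detail.

  // Illustrative sketch only; assumes x86 clflush and a Linux sysconf cache-line query.
  #include <cstdint>
  #include <cstddef>
  #include <unordered_map>
  #include <unistd.h>
  #include <immintrin.h>

  // Producer side (test): split a 64-bit VA into two 32-bit payload words.
  static inline uint32_t low_addr(const void* p)  { return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p) & 0xFFFFFFFF); }
  static inline uint32_t high_addr(const void* p) { return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p) >> 32); }

  // Consumer side (runtime): reassemble the VA from two payload words.
  static inline uint64_t concat_addr(uint32_t high, uint32_t low) {
    return (static_cast<uint64_t>(high) << 32) | low;
  }

  // Hypothetical stand-in for the driver's address-to-handle map (vmem_addr_mappings).
  using AddrToHandleMap = std::unordered_map<void*, uint32_t>;

  // Resolve the BO handle backing a reassembled virtual address.
  bool lookup_handle(const AddrToHandleMap& map, uint32_t high, uint32_t low, uint32_t* handle) {
    auto it = map.find(reinterpret_cast<void*>(concat_addr(high, low)));
    if (it == map.end()) return false;   // address was never allocated through the driver
    *handle = it->second;
    return true;
  }

  // Userspace BO sync: flush the cache lines covering [base + offset, base + offset + len),
  // following the clflush_data pattern from patch 6.
  void flush_bo_range(const void* base, size_t offset, size_t len) {
    long line = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
    if (line <= 0 || len == 0) return;   // line size unknown; caller could fall back to the SYNC_BO ioctl
    const char* cur = static_cast<const char*>(base) + offset;
    uintptr_t last = (reinterpret_cast<uintptr_t>(cur) + len - 1) | (line - 1);
    for (; reinterpret_cast<uintptr_t>(cur) <= last; cur += line)
      _mm_clflush(cur);
  }

The userspace flush is what the operand sizes added to the packet (data[9], data[10] in the test) enable: once the runtime knows each operand's length, it can flush exactly that range per BO and avoid one SYNC_BO syscall per buffer per dispatch.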