Skip to content
This repository has been archived by the owner on Dec 24, 2024. It is now read-only.

Fixing command sizing and changing the test to issue the maximum number of packets the queue supports. #23

Merged
merged 3 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 99 additions & 78 deletions rocrtst/suites/aie/aie_hsa_dispatch_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,8 @@ void load_pdi_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
bin_file.read(reinterpret_cast<char *>(*buf), size);
}

void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
void **buf) {
void load_instr_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
void **buf, uint32_t &num_instr) {
std::ifstream bin_file(file_name,
std::ios::binary | std::ios::ate | std::ios::in);

Expand All @@ -129,6 +129,7 @@ void load_dpu_file(hsa_amd_memory_pool_t mem_pool, const std::string &file_name,
auto r = hsa_amd_memory_pool_allocate(mem_pool, size, 0, buf);
assert(r == HSA_STATUS_SUCCESS);
std::memcpy(*buf, pdi_vec.data(), pdi_vec.size() * sizeof(uint32_t));
num_instr = pdi_vec.size();
}

} // namespace
Expand All @@ -144,9 +145,9 @@ int main(int argc, char **argv) {
hsa_amd_memory_pool_t global_dev_mem_pool{0};
// System memory pool. Used for allocating kernel argument data.
hsa_amd_memory_pool_t global_kernarg_mem_pool{0};
const std::string dpu_inst_file_name(sourcePath / "add_one_insts.txt");
const std::string instr_inst_file_name(sourcePath / "add_one_insts.txt");
const std::string pdi_file_name(sourcePath / "add_one.pdi");
uint32_t *dpu_inst_buf(nullptr);
uint32_t *instr_inst_buf(nullptr);
uint64_t *pdi_buf(nullptr);

assert(aie_agents.empty());
Expand All @@ -164,8 +165,6 @@ int main(int argc, char **argv) {
// Find the AIE agents in the system.
r = hsa_iterate_agents(get_aie_agents, &aie_agents);
assert(r == HSA_STATUS_SUCCESS);
// assert(hsa_iterate_agents(get_cpu_agents, &aie_agents) ==
// HSA_STATUS_SUCCESS);
assert(aie_agents.size() == 1);

const auto &aie_agent = aie_agents.front();
Expand All @@ -190,14 +189,22 @@ int main(int argc, char **argv) {
assert(r == HSA_STATUS_SUCCESS);
assert(global_kernarg_mem_pool.handle);

// Getting the maximum size of the queue so we can submit that many consecutive
// packets.
uint32_t aie_max_queue_size;
r = hsa_agent_get_info(aie_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &aie_max_queue_size);
assert(r == HSA_STATUS_SUCCESS);
int num_pkts = aie_max_queue_size;

// Load the DPU and PDI files into a global pool that doesn't support kernel
// args (DEV BO).
load_dpu_file(global_dev_mem_pool, dpu_inst_file_name,
reinterpret_cast<void **>(&dpu_inst_buf));
uint32_t dpu_handle = 0;
r = hsa_amd_get_handle_from_vaddr(dpu_inst_buf, &dpu_handle);
uint32_t num_instr;
load_instr_file(global_dev_mem_pool, instr_inst_file_name,
reinterpret_cast<void **>(&instr_inst_buf), num_instr);
uint32_t instr_handle = 0;
r = hsa_amd_get_handle_from_vaddr(instr_inst_buf, &instr_handle);
assert(r == HSA_STATUS_SUCCESS);
assert(dpu_handle != 0);
assert(instr_handle != 0);

load_pdi_file(global_dev_mem_pool, pdi_file_name,
reinterpret_cast<void **>(&pdi_buf));
Expand All @@ -222,85 +229,99 @@ int main(int argc, char **argv) {
constexpr std::size_t data_buffer_size =
num_data_elements * sizeof(std::uint32_t);

std::uint32_t *input = {};
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
reinterpret_cast<void **>(&input));
assert(r == HSA_STATUS_SUCCESS);
std::uint32_t input_handle = {};
r = hsa_amd_get_handle_from_vaddr(input, &input_handle);
assert(r == HSA_STATUS_SUCCESS);
assert(input_handle != 0);

std::uint32_t *output = {};
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
reinterpret_cast<void **>(&output));
assert(r == HSA_STATUS_SUCCESS);
std::uint32_t output_handle = {};
r = hsa_amd_get_handle_from_vaddr(output, &output_handle);
assert(r == HSA_STATUS_SUCCESS);
assert(output_handle != 0);
std::vector<uint32_t *> input(num_pkts);
std::vector<uint32_t *> output(num_pkts);
std::vector<hsa_amd_aie_ert_start_kernel_data_t *> cmd_payloads(num_pkts);
std::vector<uint32_t> input_handle(num_pkts);
std::vector<uint32_t> output_handle(num_pkts);

uint64_t wr_idx = 0;
uint64_t packet_id = 0;

for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) {
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
reinterpret_cast<void **>(&input[pkt_iter]));
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_get_handle_from_vaddr(input[pkt_iter], &input_handle[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
assert(input_handle[pkt_iter] != 0);

r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, data_buffer_size, 0,
reinterpret_cast<void **>(&output[pkt_iter]));
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_get_handle_from_vaddr(output[pkt_iter], &output_handle[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
assert(output_handle[pkt_iter] != 0);

for (std::size_t i = 0; i < num_data_elements; i++) {
*(input[pkt_iter] + i) = i * (pkt_iter + 1);
*(output[pkt_iter] + i) = 0xDEFACE;
}

for (std::size_t i = 0; i < num_data_elements; i++) {
*(input + i) = i;
*(output + i) = 0xDEFACE;
// Getting a slot in the queue
wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1);
packet_id = wr_idx % aie_queue->size;

// Creating a packet to store the command
hsa_amd_aie_ert_packet_t *cmd_pkt = static_cast<hsa_amd_aie_ert_packet_t *>(
aie_queue->base_address) + packet_id;
assert(r == HSA_STATUS_SUCCESS);
cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW;
cmd_pkt->count = 0xA; // # of arguments to put in command
cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU;
cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT;
cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
<< HSA_PACKET_HEADER_TYPE;

// Creating the payload for the packet
hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL;
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
reinterpret_cast<void **>(&cmd_payload));
assert(r == HSA_STATUS_SUCCESS);
// Selecting the PDI to use with this command
cmd_payload->cu_mask = 0x1;
// Transaction opcode
cmd_payload->data[0] = 0x3;
cmd_payload->data[1] = 0x0;
cmd_payload->data[2] = instr_handle;
cmd_payload->data[3] = 0x0;
cmd_payload->data[4] = num_instr;
cmd_payload->data[5] = input_handle[pkt_iter];
cmd_payload->data[6] = 0;
cmd_payload->data[7] = output_handle[pkt_iter];
cmd_payload->data[8] = 0;
cmd_pkt->payload_data = reinterpret_cast<uint64_t>(cmd_payload);

// Keeping track of payloads so we can free them at the end
cmd_payloads[pkt_iter] = cmd_payload;
}

///////////////////////////////////// Creating the cmd packet
// Creating a packet to store the command
hsa_amd_aie_ert_packet_t *cmd_pkt = NULL;
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
reinterpret_cast<void **>(&cmd_pkt));
assert(r == HSA_STATUS_SUCCESS);
cmd_pkt->state = HSA_AMD_AIE_ERT_STATE_NEW;
cmd_pkt->count = 0xA; // # of arguments to put in command
cmd_pkt->opcode = HSA_AMD_AIE_ERT_START_CU;
cmd_pkt->header.AmdFormat = HSA_AMD_PACKET_TYPE_AIE_ERT;
cmd_pkt->header.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC
<< HSA_PACKET_HEADER_TYPE;

// Creating the payload for the packet
hsa_amd_aie_ert_start_kernel_data_t *cmd_payload = NULL;
uint32_t cmd_handle;
r = hsa_amd_get_handle_from_vaddr(reinterpret_cast<void *>(cmd_pkt),
&cmd_handle);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_allocate(global_kernarg_mem_pool, 64, 0,
reinterpret_cast<void **>(&cmd_payload));
assert(r == HSA_STATUS_SUCCESS);
cmd_payload->cu_mask = 0x1; // Selecting the PDI to use with this command
cmd_payload->data[0] = 0x3; // Transaction opcode
cmd_payload->data[1] = 0x0;
cmd_payload->data[2] = dpu_handle;
cmd_payload->data[3] = 0x0;
cmd_payload->data[4] = 0x44; // Size of DPU instruction
cmd_payload->data[5] = input_handle;
cmd_payload->data[6] = 0;
cmd_payload->data[7] = output_handle;
cmd_payload->data[8] = 0;
cmd_pkt->payload_data = reinterpret_cast<uint64_t>(cmd_payload);

uint64_t wr_idx = hsa_queue_add_write_index_relaxed(aie_queue, 1);
uint64_t packet_id = wr_idx % aie_queue->size;
reinterpret_cast<hsa_amd_aie_ert_packet_t *>(
aie_queue->base_address)[packet_id] = *cmd_pkt;
// Ringing the doorbell to dispatch each packet we added to
// the queue
hsa_signal_store_screlease(aie_queue->doorbell_signal, wr_idx);

for (std::size_t i = 0; i < num_data_elements; i++) {
const auto expected = *(input + i) + 1;
const auto result = *(output + i);
assert(result == expected);
for (int pkt_iter = 0; pkt_iter < num_pkts; pkt_iter++) {
for (std::size_t i = 0; i < num_data_elements; i++) {
const auto expected = *(input[pkt_iter] + i) + 1;
const auto result = *(output[pkt_iter] + i);
assert(result == expected);
}

r = hsa_amd_memory_pool_free(output[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(input[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(cmd_payloads[pkt_iter]);
assert(r == HSA_STATUS_SUCCESS);
}

r = hsa_queue_destroy(aie_queue);
assert(r == HSA_STATUS_SUCCESS);

r = hsa_amd_memory_pool_free(output);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(input);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(pdi_buf);
assert(r == HSA_STATUS_SUCCESS);
r = hsa_amd_memory_pool_free(dpu_inst_buf);
r = hsa_amd_memory_pool_free(instr_inst_buf);
assert(r == HSA_STATUS_SUCCESS);

r = hsa_shut_down();
Expand Down
37 changes: 14 additions & 23 deletions runtime/hsa-runtime/core/runtime/amd_aie_aql_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,13 @@ constexpr int NON_OPERAND_COUNT = 6;
constexpr int DEV_ADDR_BASE = 0x04000000;
constexpr int DEV_ADDR_OFFSET_MASK = 0x02FFFFFF;

// BO size allocated for commands
constexpr int CMD_SIZE = 64;

// This is a temp workaround. For some reason the first command count in a chain
// needs to be a larger than it actually is, assuming there is some other data
// structure at the beginning
// TODO: Look more into this
constexpr int FIRST_CMD_COUNT_SIZE_INCREASE = 5;
// The driver places a structure before each command in a command chain.
// Need to increase the size of the command by the size of this structure.
// In the following xdna driver source can see where this is implemented:
// Commit hash: eddd92c0f61592c576a500f16efa24eb23667c23
// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_msg_priv.h#L387-L391
// https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_message.c#L637
constexpr int CMD_COUNT_SIZE_INCREASE = 3;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we get this size from the driver?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would be really nice but not sure the best way to do that. The reason it is 3 is because the driver sets the size of the command in the chain here https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_message.c#L632 which uses the size of this struct https://github.com/amd/xdna-driver/blob/main/src/driver/amdxdna/aie2_msg_priv.h#L388-L392. We don't have a need to have this cmd_chain_slot_execbuf_cf struct in the runtime but can copy it over if we want to get the size. We could be more programmatic with an ioctl but that might be overkill. Thoughts?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I love the comment there:

/* Accurate buf size to hint firmware to do necessary copy */

What happens in the FW doesn't take the hint I wonder 🤔

Anyway my thought is it's just a matter of API - xdna should have a more visible API but it doesn't so the only thing we can do is document and leave breadcrumbs. So can you please leave these exact lines in the comment (with the hash, i.e. go to the link and hit y on your keyboard).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Works with me!


// Index of command payload where the instruction sequence
// address is located
Expand Down Expand Up @@ -311,7 +310,7 @@ hsa_status_t AieAqlQueue::CreateCmd(uint32_t size, uint32_t *handle,
// Creating the command
amdxdna_drm_create_bo create_cmd_bo = {};
create_cmd_bo.type = AMDXDNA_BO_CMD,
create_cmd_bo.size = CMD_SIZE;
create_cmd_bo.size = size;
if (ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo))
return HSA_STATUS_ERROR;

Expand Down Expand Up @@ -345,7 +344,6 @@ hsa_status_t AieAqlQueue::SubmitCmd(
// Get the payload information
switch (pkt->opcode) {
case HSA_AMD_AIE_ERT_START_CU: {

std::vector<uint32_t> bo_args;
std::vector<uint32_t> cmd_handles;

Expand Down Expand Up @@ -376,23 +374,17 @@ hsa_status_t AieAqlQueue::SubmitCmd(
// Creating a packet that contains the command to execute the kernel
uint32_t cmd_bo_handle = 0;
amdxdna_cmd *cmd = nullptr;
if (CreateCmd(64, &cmd_bo_handle, &cmd, fd))
uint32_t cmd_size = sizeof(amdxdna_cmd) + pkt->count * sizeof(uint32_t);
if (CreateCmd(cmd_size, &cmd_bo_handle, &cmd, fd))
return HSA_STATUS_ERROR;

// Filling in the fields of the command
cmd->state = pkt->state;
cmd->extra_cu_masks = 0;

// For some reason the first count needs to be a little larger than
// it actually is, assuming there is some other data structure at the
// beginning
// TODO: Look more into this
if (pkt_iter == cur_id) {
cmd->count = pkt->count + FIRST_CMD_COUNT_SIZE_INCREASE;
}
else {
cmd->count = pkt->count;
}
// The driver places a structure before each command in a command chain.
// Need to increase the size of the command by the size of this structure.
cmd->count = pkt->count + CMD_COUNT_SIZE_INCREASE;
cmd->opcode = pkt->opcode;
cmd->data[0] = cmd_pkt_payload->cu_mask;
memcpy((cmd->data + 1), cmd_pkt_payload->data, 4 * pkt->count);
Expand All @@ -414,8 +406,7 @@ hsa_status_t AieAqlQueue::SubmitCmd(
// Creating a command chain
cmd_chain->state = HSA_AMD_AIE_ERT_STATE_NEW;
cmd_chain->extra_cu_masks = 0;
// TODO: Figure out why this is the value
cmd_chain->count = 0xA;
cmd_chain->count = sizeof(amdxdna_cmd_chain) + cmd_handles.size() * sizeof(uint64_t);
ypapadop-amd marked this conversation as resolved.
Show resolved Hide resolved
cmd_chain->opcode = HSA_AMD_AIE_ERT_CMD_CHAIN;
cmd_chain_payload->command_count = cmd_handles.size();
cmd_chain_payload->submit_index = 0;
Expand Down
Loading