This repository was archived by the owner on Dec 24, 2024. It is now read-only.

Adding soft queue dispatch logic to dispatch commands to AIE agents #2

Merged: 6 commits, Sep 3, 2024
21 changes: 20 additions & 1 deletion runtime/hsa-runtime/core/driver/xdna/amd_xdna_driver.cpp
@@ -118,7 +118,9 @@ hsa_status_t XdnaDriver::GetAgentProperties(core::Agent &agent) const {
return HSA_STATUS_ERROR;
}

aie_agent.SetNumCols(aie_metadata.cols);
// Right now we can only target N-1 columns, so this
// is here as a workaround
Comment on lines +121 to +122

@makslevental (Collaborator), Sep 2, 2024:

question: why can we target only N-1 columns?

Collaborator:

Discussed offline - this is specific to Phoenix (the shim DMA is missing from the 0th column) and needs to be revisited for Strix.

aie_agent.SetNumCols(aie_metadata.cols - 1);
aie_agent.SetNumCoreRows(aie_metadata.core.row_count);

return HSA_STATUS_SUCCESS;
@@ -351,6 +353,16 @@ hsa_status_t XdnaDriver::InitDeviceHeap() {
return HSA_STATUS_SUCCESS;
}

hsa_status_t XdnaDriver::GetHandleMappings(std::unordered_map<uint32_t, void*> &vmem_handle_mappings) {
vmem_handle_mappings = this->vmem_handle_mappings;
return HSA_STATUS_SUCCESS;
}

hsa_status_t XdnaDriver::GetFd(int &fd) {
fd = fd_;
return HSA_STATUS_SUCCESS;
}
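
Note: a minimal sketch of how a caller might combine these two new accessors to gather the inputs that AieAqlQueue::SubmitCmd (declared in amd_aie_aql_queue.h below) expects. The helper name and the way the driver reference is obtained are hypothetical, not part of this PR.

// Illustrative helper only: collects the DRM fd and the BO-handle to
// virtual-address map from the XDNA driver for a later command submission.
hsa_status_t GatherSubmitInputs(
    rocr::AMD::XdnaDriver &driver, int &fd,
    std::unordered_map<uint32_t, void *> &mappings) {
  if (driver.GetHandleMappings(mappings) != HSA_STATUS_SUCCESS)
    return HSA_STATUS_ERROR;
  return driver.GetFd(fd);
}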

hsa_status_t XdnaDriver::FreeDeviceHeap() {
if (dev_heap_parent) {
munmap(dev_heap_parent, dev_heap_align * 2 - 1);
@@ -388,6 +400,13 @@ hsa_status_t XdnaDriver::ConfigHwCtxCU(
config_cu_param.cu_configs[i].cu_config_bo;
xdna_config_cu_param->cu_configs[i].cu_func =
config_cu_param.cu_configs[i].cu_func;

// Sync the configuration buffer before configuring the hardware context
amdxdna_drm_sync_bo sync_args = {};
sync_args.handle = xdna_config_cu_param->cu_configs[i].cu_bo;
if (ioctl(fd_, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_args) < 0) {
return HSA_STATUS_ERROR;
}
}

amdxdna_drm_config_hwctx config_hw_ctx_args{
76 changes: 71 additions & 5 deletions runtime/hsa-runtime/core/inc/amd_aie_aql_queue.h
@@ -49,7 +49,35 @@
#include "core/inc/queue.h"
#include "core/inc/runtime.h"
#include "core/inc/signal.h"
#include "core/util/locks.h"

/*
* Interpretation of the beginning of data payload for ERT_CMD_CHAIN in
* amdxdna_cmd. The rest of the payload in amdxdna_cmd is cmd BO handles.
*/
struct amdxdna_cmd_chain {
__u32 command_count;
__u32 submit_index;
__u32 error_index;
__u32 reserved[3];
__u64 data[] __counted_by(command_count);
};


/* Exec buffer command header format */
struct amdxdna_cmd {
union {
struct {
__u32 state : 4;
__u32 unused : 6;
__u32 extra_cu_masks : 2;
__u32 count : 11;
__u32 opcode : 5;
__u32 reserved : 4;
};
__u32 header;
};
__u32 data[] __counted_by(count);
};
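
As a reading aid, here is a hypothetical sketch of how these two structs compose for a chained submission: per the comments above, the chain header sits at the start of amdxdna_cmd::data and the chained command BO handles follow in amdxdna_cmd_chain::data. The two constants are placeholders standing in for the ERT state/opcode definitions, which this diff does not show.

constexpr uint32_t kErtStateNew = 1;   // placeholder, not the real ERT value
constexpr uint32_t kErtCmdChain = 19;  // placeholder, not the real ERT value

void BuildCmdChain(amdxdna_cmd *cmd,
                   const std::vector<uint32_t> &cmd_bo_handles) {
  cmd->state = kErtStateNew;
  cmd->opcode = kErtCmdChain;
  cmd->extra_cu_masks = 0;

  auto *chain = reinterpret_cast<amdxdna_cmd_chain *>(cmd->data);
  chain->command_count = static_cast<__u32>(cmd_bo_handles.size());
  chain->submit_index = 0;
  chain->error_index = 0;
  for (size_t i = 0; i < cmd_bo_handles.size(); ++i)
    chain->data[i] = cmd_bo_handles[i];  // one cmd BO handle per entry

  // Payload size in 32-bit words: the chain header plus one __u64 slot per
  // chained command.
  cmd->count = static_cast<__u32>(
      (sizeof(amdxdna_cmd_chain) + cmd_bo_handles.size() * sizeof(__u64)) /
      sizeof(__u32));
}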

namespace rocr {
namespace AMD {
@@ -71,7 +99,7 @@ class AieAqlQueue : public core::Queue,

AieAqlQueue() = delete;
AieAqlQueue(AieAgent *agent, size_t req_size_pkts, uint32_t node_id);
~AieAqlQueue();
~AieAqlQueue() override;

hsa_status_t Inactivate() override;
hsa_status_t SetPriority(HSA_QUEUE_PRIORITY priority) override;
@@ -100,7 +128,7 @@
void *value) override;

// AIE-specific API
AieAgent &GetAgent() { return agent_; }
AieAgent &GetAgent() const { return agent_; }
void SetHwCtxHandle(uint32_t hw_ctx_handle) {
hw_ctx_handle_ = hw_ctx_handle;
}
@@ -119,7 +147,7 @@
hsa_fence_scope_t releaseFence = HSA_FENCE_SCOPE_NONE,
hsa_signal_t *signal = NULL) override;

uint32_t queue_id_ = INVALID_QUEUEID;
uint64_t queue_id_ = INVALID_QUEUEID;
/// @brief ID of AIE device on which this queue has been mapped.
uint32_t node_id_ = std::numeric_limits<uint32_t>::max();
/// @brief Queue size in bytes.
@@ -134,6 +162,44 @@
/// @brief Base of the queue's ring buffer storage.
void *ring_buf_ = nullptr;

static hsa_status_t SubmitCmd(
uint32_t hw_ctx_handle, int fd, void *queue_base,
uint64_t read_dispatch_id, uint64_t write_dispatch_id,
std::unordered_map<uint32_t, void *> &vmem_handle_mappings);

/// @brief Creates a command BO and returns a pointer to the memory and
/// the corresponding handle
///
/// @param size Size of the memory to allocate
/// @param handle A pointer to the BO handle
/// @param cmd A pointer to the command buffer
static hsa_status_t CreateCmd(uint32_t size, uint32_t *handle,
amdxdna_cmd **cmd, int fd);
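
For reviewers, a sketch of what CreateCmd might do, assuming the amdxdna uapi's DRM_IOCTL_AMDXDNA_CREATE_BO and DRM_IOCTL_AMDXDNA_GET_BO_INFO ioctls: allocate a command BO, look up its mmap offset, and map it into user space. Struct and field names here follow that uapi header but should be treated as assumptions, not verbatim code from this PR.

static hsa_status_t CreateCmd(uint32_t size, uint32_t *handle,
                              amdxdna_cmd **cmd, int fd) {
  // Ask the kernel-mode driver for a command buffer object.
  amdxdna_drm_create_bo create_args = {};
  create_args.size = size;
  create_args.type = AMDXDNA_BO_CMD;
  if (ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_args) < 0)
    return HSA_STATUS_ERROR;

  // Retrieve the mmap offset for the new BO and map it for the host.
  amdxdna_drm_get_bo_info get_info_args = {};
  get_info_args.handle = create_args.handle;
  if (ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_info_args) < 0)
    return HSA_STATUS_ERROR;

  *handle = create_args.handle;
  *cmd = static_cast<amdxdna_cmd *>(
      mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
           get_info_args.map_offset));
  return (*cmd == MAP_FAILED) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS;
}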

/// @brief Adds all BOs in a command packet payload to a vector
/// and replaces the handles with a virtual address
///
/// @param count Number of entries in the command
/// @param bo_args A pointer to a vector that contains all bo handles
/// @param cmd_pkt_payload A pointer to the payload of the command
static void RegisterCmdBOs(
uint32_t count, std::vector<uint32_t> &bo_args,
hsa_amd_aie_ert_start_kernel_data_t *cmd_pkt_payload,
std::unordered_map<uint32_t, void *> &vmem_handle_mappings);

/// @brief Syncs all BOs referenced in bo_args
///
/// @param bo_args Vector containing the handles of the BOs to sync
static hsa_status_t SyncBos(std::vector<uint32_t> &bo_args, int fd);
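
A minimal sketch of SyncBos; it reuses only the amdxdna_drm_sync_bo / DRM_IOCTL_AMDXDNA_SYNC_BO pattern already visible in the amd_xdna_driver.cpp hunk above.

static hsa_status_t SyncBos(std::vector<uint32_t> &bo_args, int fd) {
  for (uint32_t bo_handle : bo_args) {
    // Sync each BO so the device and host views of the buffer agree.
    amdxdna_drm_sync_bo sync_args = {};
    sync_args.handle = bo_handle;
    if (ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_args) < 0)
      return HSA_STATUS_ERROR;
  }
  return HSA_STATUS_SUCCESS;
}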

/// @brief Executes a command and waits for its completion
///
/// @param exec_cmd Structure containing the details of the command to execute
/// @param hw_ctx_handle The handle of the hardware context that runs this
/// command
static hsa_status_t ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd,
uint32_t hw_ctx_handle, int fd);
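
And a sketch of ExecCmdAndWait, under the assumption that the uapi pairs DRM_IOCTL_AMDXDNA_EXEC_CMD with a DRM_IOCTL_AMDXDNA_WAIT_CMD that takes the sequence number the exec ioctl returns; the field names (hwctx, seq, timeout) are assumptions based on that header.

static hsa_status_t ExecCmdAndWait(amdxdna_drm_exec_cmd *exec_cmd,
                                   uint32_t hw_ctx_handle, int fd) {
  exec_cmd->hwctx = hw_ctx_handle;
  if (ioctl(fd, DRM_IOCTL_AMDXDNA_EXEC_CMD, exec_cmd) < 0)
    return HSA_STATUS_ERROR;

  // The exec ioctl fills in exec_cmd->seq; block until that command retires.
  amdxdna_drm_wait_cmd wait_args = {};
  wait_args.hwctx = hw_ctx_handle;
  wait_args.timeout = 0;  // assumed: 0 means an unbounded wait
  wait_args.seq = exec_cmd->seq;
  if (ioctl(fd, DRM_IOCTL_AMDXDNA_WAIT_CMD, &wait_args) < 0)
    return HSA_STATUS_ERROR;

  return HSA_STATUS_SUCCESS;
}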

/// @brief Handle for an application context on the AIE device.
///
/// Each user queue will have an associated context. This handle is assigned
@@ -153,4 +219,4 @@ class AieAqlQueue : public core::Queue,
} // namespace AMD
} // namespace rocr

#endif // header guard
#endif // HSA_RUNTIME_CORE_INC_AMD_HW_AQL_AIE_COMMAND_PROCESSOR_H_
18 changes: 9 additions & 9 deletions runtime/hsa-runtime/core/inc/amd_gpu_agent.h
@@ -234,7 +234,7 @@ class GpuAgent : public GpuAgentInt {
GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xnack_mode, uint32_t index);

// @brief GPU agent destructor.
~GpuAgent();
~GpuAgent() override;

// @brief Ensure blits are ready (performance hint).
void PreloadBlits() override;
@@ -505,14 +505,14 @@ class GpuAgent : public GpuAgentInt {
hsa_status_t EnableDmaProfiling(bool enable) override;

hsa_status_t PcSamplingIterateConfig(hsa_ven_amd_pcs_iterate_configuration_callback_t cb,
void* cb_data);
hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession& session);
void *cb_data) override;
hsa_status_t PcSamplingCreate(pcs::PcsRuntime::PcSamplingSession &session) override;
hsa_status_t PcSamplingCreateFromId(HsaPcSamplingTraceId pcsId,
pcs::PcsRuntime::PcSamplingSession& session);
hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession& session);
hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession& session);
hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession& session);
hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession& session);
pcs::PcsRuntime::PcSamplingSession &session) override;
hsa_status_t PcSamplingDestroy(pcs::PcsRuntime::PcSamplingSession &session) override;
hsa_status_t PcSamplingStart(pcs::PcsRuntime::PcSamplingSession &session) override;
hsa_status_t PcSamplingStop(pcs::PcsRuntime::PcSamplingSession &session) override;
hsa_status_t PcSamplingFlush(pcs::PcsRuntime::PcSamplingSession &session) override;
hsa_status_t PcSamplingFlushHostTrapDeviceBuffers(pcs::PcsRuntime::PcSamplingSession& session);

static void PcSamplingThreadRun(void* agent);
@@ -787,4 +787,4 @@ class GpuAgent : public GpuAgentInt {
} // namespace amd
} // namespace rocr

#endif // header guard
#endif // HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_
8 changes: 4 additions & 4 deletions runtime/hsa-runtime/core/inc/amd_xdna_driver.h
@@ -47,6 +47,7 @@

#include "core/inc/driver.h"
#include "core/inc/memory_region.h"
#include "core/driver/xdna/uapi/amdxdna_accel.h"
Collaborator:

question: shouldn't we be getting this from the kernel somewhere?

Collaborator (Author):

That is a good question. I know @atgutier added this to the runtime, but I'm not sure how this is usually done. @atgutier, what is the preferred way of doing this?

Collaborator:

This is needed for now to ensure core ROCr can at least build on systems that do not have XRT installed (e.g., the Gerrit test infra currently). Typically, the installer would place this UAPI (user API) header in a known include directory. When using XRT, the header lands at /usr/src/xrt-amdxdna-2.18.0/include/uapi/drm_local/amdxdna_accel.h.

The solution for now is to keep a copy of this header in the runtime to avoid issues where we cannot find it installed globally on the system.

The GPU driver interface also directly includes the kfd_ioctl.h header in the runtime for convenience.

Collaborator:

Isn't this in the kernel now though? https://patchwork.kernel.org/project/dri-devel/cover/[email protected]/

Or meant to be? Admittedly, in my 6.10.7 kernel I only have that header in the places where XRT would've installed it.

Collaborator:

It's in the kernel, but only for inclusion by the kernel driver. We need it to be installed somewhere accessible by user mode. So far the only thing that does that is the XRT installer. I confirmed with Max that this is indeed the only way to get the header. I'd prefer not to use that, so this is a solution for now.

Eventually, we should get to a point where the driver module installer installs this header.

Collaborator:

We can make a CMake find_package integration to search for the usual locations of the XDNA driver.

Collaborator:

We'd need a package to install first. As I said, currently that is only through XRT, which nobody wants to require as a dependency here.


namespace rocr {
namespace core {
@@ -69,6 +70,9 @@ class XdnaDriver : public core::Driver {
hsa_status_t Init() override;
hsa_status_t QueryKernelModeDriver(core::DriverQuery query) override;

hsa_status_t GetHandleMappings(std::unordered_map<uint32_t, void*> &vmem_handle_mappings);
hsa_status_t GetFd(int &fd);

hsa_status_t GetAgentProperties(core::Agent &agent) const override;
hsa_status_t
GetMemoryProperties(uint32_t node_id,
@@ -126,10 +130,6 @@ class XdnaDriver : public core::Driver {
void *dev_heap_aligned = nullptr;
static constexpr size_t dev_heap_size = 48 * 1024 * 1024;
static constexpr size_t dev_heap_align = 64 * 1024 * 1024;

/// @brief DRM buffer object handle for the device heap. Assigned by the
/// kernel-mode driver.
uint32_t dev_heap_handle = 0;
};

} // namespace AMD