Skip to content

Commit

Permalink
#3943: don't make issue/completion configurable
Browse files Browse the repository at this point in the history
  • Loading branch information
abhullar-tt committed Dec 21, 2023
1 parent ba2bc8f commit 56fae0c
Show file tree
Hide file tree
Showing 7 changed files with 27 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,8 @@ TEST_F(CommandQueueFixture, TestPageSizeTooLarge) {

TEST_F(CommandQueueFixture, TestWrapHostHugepageOnEnqueueReadBuffer) {
uint32_t page_size = 2048;
uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(this->device_->id(), 0);
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_->id());
uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(this->device_->id(), channel);
uint32_t command_issue_region_size = command_queue_size * 0.75;

uint32_t max_command_size = command_issue_region_size - CQ_START;
Expand All @@ -334,7 +335,8 @@ TEST_F(CommandQueueFixture, TestWrapHostHugepageOnEnqueueReadBuffer) {

TEST_F(CommandQueueFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) {
uint32_t page_size = 2048;
uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(this->device_->id(), 0);
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_->id());
uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(this->device_->id(), channel);
uint32_t num_pages = command_queue_size / page_size;

TestBufferConfig config = {.num_pages = num_pages, .page_size = page_size, .buftype = BufferType::DRAM};
Expand All @@ -348,7 +350,8 @@ TEST_F(CommandQueueFixture, TestWrapCompletionQOnInsufficientSpace) {
uint32_t small_page_size = 2048; // page size for second read

// Using default 75-25 issue and completion queue split
uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(this->device_->id(), 0);
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_->id());
uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(this->device_->id(), channel);
uint32_t command_completion_region_size = command_queue_size * 0.25;

uint32_t first_buffer_size = tt::round_up(command_completion_region_size * 0.95, large_page_size);
Expand Down Expand Up @@ -382,7 +385,8 @@ TEST_F(CommandQueueFixture, TestWrapCompletionQOnInsufficientSpace) {
// Test that command queue wraps when buffer read needs to be split into multiple enqueue_read_buffer commands and available space in completion region is less than a page
TEST_F(CommandQueueFixture, TestWrapCompletionQOnInsufficientSpace2) {
// Using default 75-25 issue and completion queue split
uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(this->device_->id(), 0);
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_->id());
uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(this->device_->id(), channel);
uint32_t command_completion_region_size = command_queue_size * 0.25;

uint32_t num_pages_buff_1 = 9;
Expand Down
10 changes: 4 additions & 6 deletions tt_metal/detail/tt_metal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -440,10 +440,8 @@ namespace tt::tt_metal{
}

// Sending dispatch kernel. TODO(agrebenisan): Needs a refactor
inline void SendDispatchKernelToDevice(Device *device) {
inline void SendDispatchKernelToDevice(Device *device, uint32_t command_issue_region_size, uint32_t command_completion_region_size) {
ZoneScoped;
// Ideally, this should be some separate API easily accessible in
// TT-metal, don't like the fact that I'm writing this from scratch

Program dispatch_program = CreateProgram();
auto dispatch_cores = device->dispatch_cores().begin();
Expand All @@ -463,8 +461,8 @@ namespace tt::tt_metal{
{"PRODUCER_NOC_Y", std::to_string(producer_physical_core.y)},
};

std::vector<uint32_t> producer_compile_args = {DeviceCommand::COMMAND_ISSUE_REGION_SIZE};
std::vector<uint32_t> consumer_compile_args = {tt::Cluster::instance().get_tensix_soft_reset_addr(), DeviceCommand::COMMAND_ISSUE_REGION_SIZE, DeviceCommand::COMMAND_COMPLETION_REGION_SIZE}
std::vector<uint32_t> producer_compile_args = {command_issue_region_size};
std::vector<uint32_t> consumer_compile_args = {tt::Cluster::instance().get_tensix_soft_reset_addr(), command_issue_region_size, command_completion_region_size}

tt::tt_metal::CreateKernel(
dispatch_program,
Expand Down Expand Up @@ -497,7 +495,7 @@ namespace tt::tt_metal{
tt::tt_metal::detail::WriteToDeviceL1(device, producer_logical_core, CQ_ISSUE_READ_PTR, issue_fifo_addr_vector);
tt::tt_metal::detail::WriteToDeviceL1(device, producer_logical_core, CQ_ISSUE_WRITE_PTR, issue_fifo_addr_vector);

uint32_t completion_fifo_addr = DeviceCommand::COMMAND_ISSUE_REGION_SIZE >> 4;
uint32_t completion_fifo_addr = command_issue_region_size >> 4;
vector<uint32_t> completion_fifo_addr_vector = {completion_fifo_addr};
tt::tt_metal::detail::WriteToDeviceL1(device, consumer_logical_core, CQ_COMPLETION_WRITE_PTR, completion_fifo_addr_vector);
tt::tt_metal::detail::WriteToDeviceL1(device, consumer_logical_core, CQ_COMPLETION_READ_PTR, completion_fifo_addr_vector);
Expand Down
9 changes: 4 additions & 5 deletions tt_metal/host_api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,11 @@ size_t GetNumPCIeDevices();
*
* Return value: Device *
*
* | Argument | Description | Type | Valid Range | Required |
* |---------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------|-----------------------------------|-----------------------|
* | device_id | ID of the device to target | chip_id_t (int) | 0 to (GetNumAvailableDevices - 1) | Yes |
* | command_issue_queue_split | Percentage of the command queue that is dedicated for issuing commands. Issue queue size is rounded to be 32B aligned.<br><br>Remaining space is dedicated for completion queue.<br><br>Commands and data being sent to device are pushed to issue queue, data requested from device is pushed to completion queue.<br><br>Smaller issue queues can lead to more stalls due to increased number of wraps to head to fit commands. | std::optional<float> | > 0 | No, defaults to 0.75. |
* | Argument | Description | Type | Valid Range | Required |
* |-----------|----------------------------|-----------------|-----------------------------------|----------|
* | device_id | ID of the device to target | chip_id_t (int) | 0 to (GetNumAvailableDevices - 1) | Yes |
* */
Device *CreateDevice(chip_id_t device_id, std::optional<float> command_issue_queue_split = std::nullopt, const std::vector<uint32_t>& l1_bank_remap = {});
Device *CreateDevice(chip_id_t device_id, const std::vector<uint32_t>& l1_bank_remap = {});

/**
* Resets device and closes device
Expand Down
4 changes: 2 additions & 2 deletions tt_metal/impl/device/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -302,13 +302,13 @@ bool Device::initialize(const std::vector<uint32_t>& l1_bank_remap) {

std::vector<uint32_t> pointers(CQ_START / sizeof(uint32_t), 0);
pointers[HOST_CQ_ISSUE_READ_PTR / sizeof(uint32_t)] = CQ_START >> 4; // HOST_CQ_ISSUE_READ_PTR
pointers[HOST_CQ_COMPLETION_WRITE_PTR / sizeof(uint32_t)] = DeviceCommand::COMMAND_ISSUE_REGION_SIZE >> 4; // HOST_CQ_COMPLETION_WRITE_PTR
pointers[HOST_CQ_COMPLETION_WRITE_PTR / sizeof(uint32_t)] = this->sysmem_manager->cq_interface.command_issue_region_size >> 4; // HOST_CQ_COMPLETION_WRITE_PTR

chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->id_);
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->id_);
tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), 0, mmio_device_id, channel);

detail::SendDispatchKernelToDevice(this);
detail::SendDispatchKernelToDevice(this, this->sysmem_manager->cq_interface.command_issue_region_size, this->sysmem_manager->cq_interface.command_completion_region_size););
}

return true;
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/impl/dispatch/command_queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ namespace detail{

class CommandQueue {
public:
CommandQueue(Device* device, std::optional<float> command_issue_queue_split);
CommandQueue(Device* device);

~CommandQueue();

Expand Down
12 changes: 7 additions & 5 deletions tt_metal/impl/dispatch/command_queue_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,8 @@ struct SystemMemoryCQInterface {
// Equation for issue fifo size is
// | issue_fifo_wr_ptr + command size B - issue_fifo_rd_ptr |
// Space available would just be issue_fifo_limit - issue_fifo_size
SystemMemoryCQInterface(uint32_t command_queue_size, std::optional<float> command_issue_queue_split) : command_queue_size(command_queue_size) {
float issue_queue_split = command_issue_queue_split.has_value() ? command_issue_queue_split.value() : default_issue_queue_split;

this->command_issue_region_size = tt::round_up(command_queue_size * issue_queue_split, 32);
SystemMemoryCQInterface(uint32_t command_queue_size) : command_queue_size(command_queue_size) {
this->command_issue_region_size = tt::round_up(command_queue_size * this->default_issue_queue_split, 32);
this->command_completion_region_size = this->command_queue_size - this->command_issue_region_size;

this->issue_fifo_size = (this->command_issue_region_size - CQ_START) >> 4;
Expand All @@ -51,6 +49,9 @@ struct SystemMemoryCQInterface {
}

const uint32_t command_queue_size;

// Percentage of the command queue that is dedicated for issuing commands. Issue queue size is rounded to be 32B aligned and remaining space is dedicated for completion queue
// Smaller issue queues can lead to more stalls for applications that send more work to device than readback data.
static constexpr float default_issue_queue_split = 0.75;
uint32_t command_issue_region_size;
uint32_t command_completion_region_size;
Expand Down Expand Up @@ -83,8 +84,9 @@ class SystemMemoryManager {
public:
SystemMemoryCQInterface cq_interface;
SystemMemoryManager(chip_id_t device_id, const std::set<CoreCoord> &dev_dispatch_cores, const std::function<CoreCoord (CoreCoord)> &worker_from_logical) :
cq_interface(tt::Cluster::instance().get_host_channel_size(device_id, tt::Cluster::instance().get_assigned_channel_for_device(device_id))),
device_id(device_id),
m_dma_buf_size(tt::Cluster::instance().get_m_dma_buf_size(device->id())),
m_dma_buf_size(tt::Cluster::instance().get_m_dma_buf_size(device_id)),
hugepage_start(
(char*) tt::Cluster::instance().host_dma_address(0, tt::Cluster::instance().get_associated_mmio_device(device_id), tt::Cluster::instance().get_assigned_channel_for_device(device_id))),
fast_write_callable(
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/tt_metal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -561,7 +561,7 @@ size_t GetNumPCIeDevices() {
#endif
}

Device *CreateDevice(chip_id_t device_id, std::optional<float> command_issue_queue_split, const std::vector<uint32_t>& l1_bank_remap) {
Device *CreateDevice(chip_id_t device_id, const std::vector<uint32_t>& l1_bank_remap) {
Device * dev = new Device(device_id, l1_bank_remap);
return dev;
}
Expand Down

0 comments on commit 56fae0c

Please sign in to comment.