Skip to content

Commit

Permalink
#3943: fixes to completion queue changes after rebase
Browse files Browse the repository at this point in the history
  • Loading branch information
abhullar-tt committed Dec 21, 2023
1 parent 56fae0c commit 61348cf
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ TEST_F(CommandQueueFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) {

TestBufferConfig config = {.num_pages = num_pages, .page_size = page_size, .buftype = BufferType::DRAM};

EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, *tt::tt_metal::detail::GLOBAL_CQ, config));
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config));
}

// Test that command queue wraps when buffer available space in completion region is less than a page
Expand All @@ -362,23 +362,23 @@ TEST_F(CommandQueueFixture, TestWrapCompletionQOnInsufficientSpace) {

Buffer buff_1(this->device_, first_buffer_size, large_page_size, BufferType::DRAM);
auto src_1 = local_test_functions::generate_arange_vector(buff_1.size());
EnqueueWriteBuffer(*tt::tt_metal::detail::GLOBAL_CQ, buff_1, src_1, false);
EnqueueWriteBuffer(tt::tt_metal::detail::GetCommandQueue(device_), buff_1, src_1, false);
vector<uint32_t> result_1;
EnqueueReadBuffer(*tt::tt_metal::detail::GLOBAL_CQ, buff_1, result_1, true);
EnqueueReadBuffer(tt::tt_metal::detail::GetCommandQueue(device_), buff_1, result_1, true);
EXPECT_EQ(src_1, result_1);

Buffer buff_2(this->device_, num_pages_second_buffer * small_page_size, small_page_size, BufferType::DRAM);
auto src_2 = local_test_functions::generate_arange_vector(buff_2.size());
EnqueueWriteBuffer(*tt::tt_metal::detail::GLOBAL_CQ, buff_2, src_2, false);
EnqueueWriteBuffer(tt::tt_metal::detail::GetCommandQueue(device_), buff_2, src_2, false);
vector<uint32_t> result_2;
EnqueueReadBuffer(*tt::tt_metal::detail::GLOBAL_CQ, buff_2, result_2, true);
EnqueueReadBuffer(tt::tt_metal::detail::GetCommandQueue(device_), buff_2, result_2, true);
EXPECT_EQ(src_2, result_2);

Buffer buff_3(this->device_, 32 * large_page_size, large_page_size, BufferType::DRAM);
auto src_3 = local_test_functions::generate_arange_vector(buff_3.size());
EnqueueWriteBuffer(*tt::tt_metal::detail::GLOBAL_CQ, buff_3, src_3, false);
EnqueueWriteBuffer(tt::tt_metal::detail::GetCommandQueue(device_), buff_3, src_3, false);
vector<uint32_t> result_3;
EnqueueReadBuffer(*tt::tt_metal::detail::GLOBAL_CQ, buff_3, result_3, true);
EnqueueReadBuffer(tt::tt_metal::detail::GetCommandQueue(device_), buff_3, result_3, true);
EXPECT_EQ(src_3, result_3);
}

Expand All @@ -400,16 +400,16 @@ TEST_F(CommandQueueFixture, TestWrapCompletionQOnInsufficientSpace2) {
uint32_t num_pages_for_wrapping_buffer = (avail_space_for_wrapping_buffer / page_size) + 4;

auto src_1 = local_test_functions::generate_arange_vector(buff_1.size());
EnqueueWriteBuffer(*tt::tt_metal::detail::GLOBAL_CQ, buff_1, src_1, false);
EnqueueWriteBuffer(tt::tt_metal::detail::GetCommandQueue(device_), buff_1, src_1, false);
vector<uint32_t> result_1;
EnqueueReadBuffer(*tt::tt_metal::detail::GLOBAL_CQ, buff_1, result_1, true);
EnqueueReadBuffer(tt::tt_metal::detail::GetCommandQueue(device_), buff_1, result_1, true);
EXPECT_EQ(src_1, result_1);

Buffer wrap_buff(this->device_, num_pages_for_wrapping_buffer * page_size, page_size, BufferType::DRAM);
auto src_2 = local_test_functions::generate_arange_vector(wrap_buff.size());
EnqueueWriteBuffer(*tt::tt_metal::detail::GLOBAL_CQ, wrap_buff, src_2, false);
EnqueueWriteBuffer(tt::tt_metal::detail::GetCommandQueue(device_), wrap_buff, src_2, false);
vector<uint32_t> result_2;
EnqueueReadBuffer(*tt::tt_metal::detail::GLOBAL_CQ, wrap_buff, result_2, true);
EnqueueReadBuffer(tt::tt_metal::detail::GetCommandQueue(device_), wrap_buff, result_2, true);
EXPECT_EQ(src_2, result_2);
}

Expand Down
4 changes: 2 additions & 2 deletions tt_metal/detail/tt_metal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@ namespace tt::tt_metal{
};

std::vector<uint32_t> producer_compile_args = {command_issue_region_size};
std::vector<uint32_t> consumer_compile_args = {tt::Cluster::instance().get_tensix_soft_reset_addr(), command_issue_region_size, command_completion_region_size}
std::vector<uint32_t> consumer_compile_args = {tt::Cluster::instance().get_tensix_soft_reset_addr(), command_issue_region_size, command_completion_region_size};

tt::tt_metal::CreateKernel(
dispatch_program,
Expand Down Expand Up @@ -494,7 +494,7 @@ namespace tt::tt_metal{
vector<uint32_t> issue_fifo_addr_vector = {issue_fifo_addr};
tt::tt_metal::detail::WriteToDeviceL1(device, producer_logical_core, CQ_ISSUE_READ_PTR, issue_fifo_addr_vector);
tt::tt_metal::detail::WriteToDeviceL1(device, producer_logical_core, CQ_ISSUE_WRITE_PTR, issue_fifo_addr_vector);

uint32_t completion_fifo_addr = command_issue_region_size >> 4;
vector<uint32_t> completion_fifo_addr_vector = {completion_fifo_addr};
tt::tt_metal::detail::WriteToDeviceL1(device, consumer_logical_core, CQ_COMPLETION_WRITE_PTR, completion_fifo_addr_vector);
Expand Down
4 changes: 2 additions & 2 deletions tt_metal/impl/device/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ bool Device::initialize(const std::vector<uint32_t>& l1_bank_remap) {
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->id_);
tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), 0, mmio_device_id, channel);

detail::SendDispatchKernelToDevice(this, this->sysmem_manager->cq_interface.command_issue_region_size, this->sysmem_manager->cq_interface.command_completion_region_size););
detail::SendDispatchKernelToDevice(this, this->sysmem_manager->cq_interface.command_issue_region_size, this->sysmem_manager->cq_interface.command_completion_region_size);
}

return true;
Expand Down Expand Up @@ -340,7 +340,7 @@ bool Device::close() {
tt::Cluster::instance().l1_barrier(id_);
allocator::clear(*this->allocator_);
if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
this->sysmem_writer.reset(nullptr);
this->sysmem_manager.reset(nullptr);
}

this->active_devices_.deactivate_device(this->id_);
Expand Down
17 changes: 10 additions & 7 deletions tt_metal/impl/dispatch/command_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -809,6 +809,9 @@ void CommandQueue::enqueue_read_buffer(Buffer& buffer, void* dst, bool blocking)
ZoneScopedN("CommandQueue_read_buffer");
TT_FATAL(blocking, "EnqueueReadBuffer only has support for blocking mode currently");

chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device->id());
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id());

uint32_t read_buffer_command_size = DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND;

uint32_t padded_page_size = align(buffer.page_size(), 32);
Expand All @@ -824,7 +827,7 @@ void CommandQueue::enqueue_read_buffer(Buffer& buffer, void* dst, bool blocking)
}
tt::log_debug(tt::LogDispatch, "EnqueueReadBuffer");

const uint32_t command_queue_size = this->sysmem_manager.cq_interface.command_queue_size;
const uint32_t command_queue_size = this->device->sysmem_manager->cq_interface.command_queue_size;
uint32_t available_space_bytes = command_queue_size - (get_cq_completion_wr_ptr(this->device->id()) << 4);
if (available_space_bytes < padded_page_size) {
// Wrap the completion region because a single page won't fit in available space
Expand Down Expand Up @@ -860,11 +863,11 @@ void CommandQueue::enqueue_read_buffer(Buffer& buffer, void* dst, bool blocking)
// If page size is not 32B-aligned, we cannot do a contiguous copy
uint32_t dst_address_offset = unpadded_dst_offset;
for (uint32_t sysmem_address_offset = 0; sysmem_address_offset < bytes_read; sysmem_address_offset += padded_page_size) {
tt::Cluster::instance().read_sysmem((char*)dst + dst_address_offset, buffer.page_size(), command.read_buffer_addr + sysmem_address_offset, 0);
tt::Cluster::instance().read_sysmem((char*)dst + dst_address_offset, buffer.page_size(), command.read_buffer_addr + sysmem_address_offset, mmio_device_id, channel);
dst_address_offset += buffer.page_size();
}
} else {
tt::Cluster::instance().read_sysmem((char*)dst + unpadded_dst_offset, bytes_read, command.read_buffer_addr, 0);
tt::Cluster::instance().read_sysmem((char*)dst + unpadded_dst_offset, bytes_read, command.read_buffer_addr, mmio_device_id, channel);
}

this->device->sysmem_manager->completion_queue_pop_front(bytes_read); // signal to device that data has been read
Expand Down Expand Up @@ -902,7 +905,7 @@ void CommandQueue::enqueue_write_buffer(Buffer& buffer, const void* src, bool bl
uint32_t padded_page_size = align(buffer.page_size(), 32);
uint32_t total_pages_to_write = buffer.num_pages();
uint32_t dst_offset = 0;
const uint32_t command_issue_region_size = this->sysmem_manager.cq_interface.command_issue_region_size;
const uint32_t command_issue_region_size = this->device->sysmem_manager->cq_interface.command_issue_region_size;

auto get_num_pages_to_write = [&total_pages_to_write, &padded_page_size] (uint32_t available_space_bytes) {
uint32_t available_data_space_bytes = available_space_bytes - DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND;
Expand All @@ -916,11 +919,11 @@ void CommandQueue::enqueue_write_buffer(Buffer& buffer, const void* src, bool bl
};

while (total_pages_to_write > 0) {
uint32_t available_space_bytes = command_issue_region_size - this->sysmem_manager.get_issue_queue_write_ptr();
uint32_t available_space_bytes = command_issue_region_size - this->device->sysmem_manager->get_issue_queue_write_ptr();
if (available_space_bytes <= DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND) {
// No space for the command
this->wrap();
available_space_bytes = command_issue_region_size - this->sysmem_manager.get_issue_queue_write_ptr(); // recompute after wrapping
available_space_bytes = command_issue_region_size - this->device->sysmem_manager->get_issue_queue_write_ptr(); // recompute after wrapping
}
tt::log_debug(tt::LogDispatch, "EnqueueWriteBuffer");

Expand Down Expand Up @@ -985,7 +988,7 @@ void CommandQueue::enqueue_program(Program& program, bool blocking) {
if ((this->device->sysmem_manager->get_issue_queue_write_ptr()) + host_data_and_device_command_size >=
this->device->sysmem_manager->cq_interface.command_issue_region_size) {
TT_ASSERT(
host_data_and_device_command_size <= this->sysmem_manager.cq_interface.command_issue_region_size - CQ_START, "EnqueueProgram command size too large");
host_data_and_device_command_size <= this->device->sysmem_manager->cq_interface.command_issue_region_size - CQ_START, "EnqueueProgram command size too large");
this->wrap();
}

Expand Down
5 changes: 3 additions & 2 deletions tt_metal/impl/dispatch/command_queue_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ class SystemMemoryManager {
public:
SystemMemoryCQInterface cq_interface;
SystemMemoryManager(chip_id_t device_id, const std::set<CoreCoord> &dev_dispatch_cores, const std::function<CoreCoord (CoreCoord)> &worker_from_logical) :
cq_interface(tt::Cluster::instance().get_host_channel_size(device_id, tt::Cluster::instance().get_assigned_channel_for_device(device_id))),
cq_interface(
tt::Cluster::instance().get_host_channel_size(tt::Cluster::instance().get_associated_mmio_device(device_id), tt::Cluster::instance().get_assigned_channel_for_device(device_id))),
device_id(device_id),
m_dma_buf_size(tt::Cluster::instance().get_m_dma_buf_size(device_id)),
hugepage_start(
Expand All @@ -93,7 +94,7 @@ class SystemMemoryManager {
tt::Cluster::instance().get_fast_pcie_static_tlb_write_callable(device_id)),
dispatch_cores(dev_dispatch_cores),
worker_from_logical_callable(worker_from_logical) {

auto dispatch_cores_iter = dispatch_cores.begin();
const std::tuple<uint32_t, uint32_t> producer_tlb_data = tt::Cluster::instance().get_tlb_data(tt_cxy_pair(device_id, this->worker_from_logical_callable(*dispatch_cores_iter++))).value();
auto [producer_tlb_offset, producer_tlb_size] = producer_tlb_data;
Expand Down

0 comments on commit 61348cf

Please sign in to comment.