diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh
index 89c4d8d1adf..75271befba4 100755
--- a/tests/scripts/run_tests.sh
+++ b/tests/scripts/run_tests.sh
@@ -65,8 +65,10 @@ run_post_commit_pipeline_tests() {
     # run_module_tests "$tt_arch" "llrt" "$pipeline_type"
     if [[ $dispatch_mode == "slow" ]]; then
         ./tests/scripts/run_pre_post_commit_regressions_slow_dispatch.sh
-    else
+    elif [[ $dispatch_mode == "fast" ]]; then
         ./tests/scripts/run_pre_post_commit_regressions_fast_dispatch.sh
+    elif [[ $dispatch_mode == "fast-multi-queue-single-device" ]]; then
+        TT_METAL_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue --gtest_filter=MultiCommandQueueSingleDeviceFixture.*
     fi
 }

@@ -167,7 +169,6 @@ run_pipeline_tests() {
     # Add your logic here for pipeline-specific tests
     echo "Running tests for pipeline: $pipeline_type with tt-arch: $tt_arch"
-
     # Call the appropriate module tests based on pipeline
     if [[ $pipeline_type == "post_commit" ]]; then
         run_post_commit_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode"
@@ -261,7 +262,7 @@ main() {
     dispatch_mode=${dispatch_mode:-$default_dispatch_mode}
     pipeline_type=${pipeline_type:-$default_pipeline_type}

-    available_dispatch_modes=("fast" "slow")
+    available_dispatch_modes=("fast" "slow" "fast-multi-queue-single-device")
     available_tt_archs=("grayskull" "wormhole_b0")

     # Validate arguments
diff --git a/tests/tt_metal/tt_metal/module.mk b/tests/tt_metal/tt_metal/module.mk
index 695e083f59e..03d8899863b 100644
--- a/tests/tt_metal/tt_metal/module.mk
+++ b/tests/tt_metal/tt_metal/module.mk
@@ -1,6 +1,7 @@
 include $(TT_METAL_HOME)/tests/tt_metal/tt_metal/unit_tests_common/module.mk
 include $(TT_METAL_HOME)/tests/tt_metal/tt_metal/unit_tests/module.mk
 include $(TT_METAL_HOME)/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/module.mk
+include $(TT_METAL_HOME)/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/module.mk

 # Programming examples for external users
 include $(TT_METAL_HOME)/tt_metal/programming_examples/module.mk
@@ -77,7 +78,7 @@
 TT_METAL_TESTS_DEPS = $(addprefix $(OBJDIR)/, $(TT_METAL_TESTS_SRCS:.cpp=.d))

 -include $(TT_METAL_TESTS_DEPS)

 # Each module has a top level target as the entrypoint which must match the subdir name
-tests/tt_metal: $(TT_METAL_TESTS) programming_examples tests/tt_metal/unit_tests tests/tt_metal/unit_tests_fast_dispatch
+tests/tt_metal: $(TT_METAL_TESTS) programming_examples tests/tt_metal/unit_tests tests/tt_metal/unit_tests_fast_dispatch tests/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue

 tests/tt_metal/all: $(TT_METAL_TESTS)
 tests/tt_metal/%: $(TESTDIR)/tt_metal/% ;
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp
index 4566a1d7abc..22e5b96dd2b 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp
@@ -11,14 +11,14 @@
 #include "tt_metal/tools/profiler/op_profiler.hpp"

 using namespace tt;

 void measure_latency(string kernel_name) {
     const int device_id = 0;
     tt_metal::Device *device = tt_metal::CreateDevice(device_id);

-    auto dispatch_cores = device->dispatch_cores().begin();
-    CoreCoord producer_logical_core = *dispatch_cores++;
-    CoreCoord consumer_logical_core = *dispatch_cores;
+    CoreCoord producer_logical_core = {0, 0};
+    CoreCoord consumer_logical_core = {0, 0};

     auto first_worker_physical_core = device->worker_core_from_logical_core({0, 0});
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_CommandQueue.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_CommandQueue.cpp
index 0a56a0ab71f..358dd2f6d24 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_CommandQueue.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_CommandQueue.cpp
@@ -17,7 +17,8 @@
 using namespace tt::tt_metal;

 namespace host_tests {

-TEST_F(MultiCommandQueueFixture, TestAccessCommandQueue) {
+namespace multi_device_tests {
+TEST_F(CommandQueueMultiDeviceFixture, TestAccessCommandQueue) {
     for (unsigned int device_id = 0; device_id < num_devices_; device_id++) {
         EXPECT_NO_THROW(detail::GetCommandQueue(devices_[device_id]));
     }
@@ -36,7 +37,7 @@ TEST(FastDispatchHostSuite, TestCannotAccessCommandQueueForClosedDevice) {
     EXPECT_ANY_THROW(detail::GetCommandQueue(device));
 }

-TEST_F(MultiCommandQueueFixture, TestDirectedLoopbackToUniqueHugepage) {
+TEST_F(CommandQueueMultiDeviceFixture, TestDirectedLoopbackToUniqueHugepage) {
     std::unordered_map<chip_id_t, std::vector<uint32_t>> golden_data;

     const uint32_t byte_size = 2048 * 16;
@@ -62,5 +63,9 @@ TEST_F(CommandQueueMultiDeviceFixture, TestDirectedLoopbackToUniqueHugepage) {
         EXPECT_EQ(readback_data, golden_data.at(device_id));
     }
 }
+}  // namespace multi_device_tests

 }  // namespace host_tests
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
index 94972fc9400..b24ce534773 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
@@ -197,7 +197,6 @@ bool stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_sharded(

 bool test_EnqueueWrap_on_EnqueueReadBuffer(Device* device, CommandQueue& cq, const TestBufferConfig& config) {
     auto [buffer, src] = EnqueueWriteBuffer_prior_to_wrap(device, cq, config);
-
     vector<uint32_t> dst;
     EnqueueReadBuffer(cq, buffer, dst, true);
@@ -260,7 +259,6 @@ namespace dram_tests {

 TEST_F(CommandQueueFixture, WriteOneTileToDramBank0) {
     TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM};
-
     EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config));
 }
@@ -329,8 +327,8 @@ TEST_F(CommandQueueFixture, TestWrapHostHugepageOnEnqueueReadBuffer) {
     uint32_t num_pages = buffer_size / page_size;
     TestBufferConfig buf_config = {.num_pages = num_pages, .page_size = page_size, .buftype = BufferType::DRAM};

-    EXPECT_TRUE(local_test_functions::test_EnqueueWrap_on_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), buf_config));
+    CommandQueue a(this->device_, 0);
+    EXPECT_TRUE(local_test_functions::test_EnqueueWrap_on_EnqueueReadBuffer(this->device_, a, buf_config));
 }

 TEST_F(CommandQueueFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) {
@@ -420,7 +418,6 @@ namespace l1_tests {

 TEST_F(CommandQueueFixture, WriteOneTileToL1Bank0) {
     TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::L1};
-
     EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config));
 }
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp
index a320ac89ee7..847990db9c4 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp
@@ -63,7 +63,7 @@ class MultiCommandQueueFixture : public ::testing::Test {
     size_t num_devices_;
 };

-class MultiCommandQueueFixture : public ::testing::Test {
+class CommandQueueMultiDeviceFixture : public ::testing::Test {
    protected:
     void SetUp() override {
         auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE");
@@ -73,8 +73,6 @@ class CommandQueueMultiDeviceFixture : public ::testing::Test {
         }
         arch_ = tt::get_arch_from_string(tt::test_utils::get_env_arch_name());

-        num_devices_ = tt::tt_metal::GetNumAvailableDevices();
-
         for (unsigned int id = 0; id < num_devices_; id++) {
             auto* device = tt::tt_metal::CreateDevice(id);
             devices_.push_back(device);
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
new file mode 100644
index 00000000000..606f87351df
--- /dev/null
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
@@ -0,0 +1,226 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <memory>
+
+#include "command_queue_fixture.hpp"
+#include "gtest/gtest.h"
+#include "tt_metal/host_api.hpp"
+#include "tt_metal/test_utils/env_vars.hpp"
+#include "tt_metal/detail/tt_metal.hpp"
+
+using namespace tt::tt_metal;
+
+struct TestBufferConfig {
+    uint32_t num_pages;
+    uint32_t page_size;
+    BufferType buftype;
+};
+
+struct BufferStressTestConfig {
+    // Used for normal write/read tests
+    uint32_t seed;
+    uint32_t num_pages_total;
+
+    uint32_t page_size;
+    uint32_t max_num_pages_per_buffer;
+
+    // Used for wrap test
+    uint32_t num_iterations;
+    uint32_t num_unique_vectors;
+};
+
+namespace local_test_functions {
+
+vector<uint32_t> generate_arange_vector(uint32_t size_bytes) {
+    TT_FATAL(size_bytes % sizeof(uint32_t) == 0);
+    vector<uint32_t> src(size_bytes / sizeof(uint32_t), 0);
+
+    for (uint32_t i = 0; i < src.size(); i++) {
+        src.at(i) = i;
+    }
+    return src;
+}
+
+bool test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(Device* device, vector<std::reference_wrapper<CommandQueue>>& cqs, const TestBufferConfig& config) {
+    bool pass = true;
+    for (const bool use_void_star_api : {true, false}) {
+
+        size_t buf_size = config.num_pages * config.page_size;
+        vector<std::unique_ptr<Buffer>> buffers;
+        vector<vector<uint32_t>> srcs;
+        for (uint i = 0; i < cqs.size(); i++) {
+            buffers.push_back(std::make_unique<Buffer>(device, buf_size, config.page_size, config.buftype));
+            srcs.push_back(generate_arange_vector(buffers[i]->size()));
+            if (use_void_star_api) {
+                EnqueueWriteBuffer(cqs[i], *buffers[i], srcs[i].data(), false);
+            } else {
+                EnqueueWriteBuffer(cqs[i], *buffers[i], srcs[i], false);
+            }
+        }
+
+        for (uint i = 0; i < cqs.size(); i++) {
+            vector<uint32_t> result;
+            if (use_void_star_api) {
+                result.resize(buf_size / sizeof(uint32_t));
+                EnqueueReadBuffer(cqs[i], *buffers[i], result.data(), true);
+            } else {
+                EnqueueReadBuffer(cqs[i], *buffers[i], result, true);
+            }
+            bool local_pass = (srcs[i] == result);
+            pass &= local_pass;
+        }
+    }
+
+    return pass;
+}
+}  // namespace local_test_functions
+
+namespace basic_tests {
+namespace dram_tests {
+
+TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToDramBank0) {
+    TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM};
+    CommandQueue a(this->device_, 0);
+    CommandQueue b(this->device_, 1);
+    vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
+    EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
+}
+
+TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToAllDramBanks) {
+    TestBufferConfig config = {
+        .num_pages = uint32_t(this->device_->num_banks(BufferType::DRAM)),
+        .page_size = 2048,
+        .buftype = BufferType::DRAM};
+
+    CommandQueue a(this->device_, 0);
+    CommandQueue b(this->device_, 1);
+    vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
+    EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
+}
+
+TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) {
+    constexpr uint32_t num_round_robins = 2;
+    TestBufferConfig config = {
+        .num_pages = num_round_robins * (this->device_->num_banks(BufferType::DRAM)),
+        .page_size = 2048,
+        .buftype = BufferType::DRAM};
+
+    CommandQueue a(this->device_, 0);
+    CommandQueue b(this->device_, 1);
+    vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
+    EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
+}
+
+TEST_F(MultiCommandQueueSingleDeviceFixture, Sending131072Pages) {
+    // Was a failing case where we used to accidentally program the CB num pages to be the total
+    // number of pages instead of the CB num pages.
+    TestBufferConfig config = {
+        .num_pages = 131072,
+        .page_size = 128,
+        .buftype = BufferType::DRAM};
+
+    CommandQueue a(this->device_, 0);
+    CommandQueue b(this->device_, 1);
+    vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
+    EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
+}
+
+TEST_F(MultiCommandQueueSingleDeviceFixture, TestNon32BAlignedPageSizeForDram) {
+    TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::DRAM};
+
+    CommandQueue a(this->device_, 0);
+    CommandQueue b(this->device_, 1);
+    vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
+    EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
+}
+
+TEST_F(MultiCommandQueueSingleDeviceFixture, TestNon32BAlignedPageSizeForDram2) {
+    // From stable diffusion read buffer
+    TestBufferConfig config = {.num_pages = 8 * 1024, .page_size = 80, .buftype = BufferType::DRAM};
+
+    CommandQueue a(this->device_, 0);
+    CommandQueue b(this->device_, 1);
+    vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
+    EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
+}
+
+TEST_F(MultiCommandQueueSingleDeviceFixture, TestPageSizeTooLarge) {
+    if (this->arch_ == tt::ARCH::WORMHOLE_B0) {
+        GTEST_SKIP(); // This test hangs on wormhole b0
+    }
+    // Should throw a host error due to the page size not fitting in the consumer CB
+    TestBufferConfig config = {.num_pages = 1024, .page_size = 250880 * 2, .buftype = BufferType::DRAM};
+
+    CommandQueue a(this->device_, 0);
+    CommandQueue b(this->device_, 1);
+    vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
+    EXPECT_ANY_THROW(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
+}
+
+TEST_F(MultiCommandQueueSingleDeviceFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) {
+    uint32_t page_size = 2048;
+    uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_->id());
+    uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(this->device_->id(), channel);
+    uint32_t num_pages = command_queue_size / page_size;
+
+    TestBufferConfig config = {.num_pages = num_pages, .page_size = page_size, .buftype = BufferType::DRAM};
+
+    CommandQueue a(this->device_, 0);
+    CommandQueue b(this->device_, 1);
+    vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
+    EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
+}
+
+}  // end namespace dram_tests
+
+namespace l1_tests {
+
+TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToL1Bank0) {
+    TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::L1};
+    CommandQueue a(this->device_, 0);
+    CommandQueue b(this->device_, 1);
+    vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
+    EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
+}
+
+TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToAllL1Banks) {
+    auto compute_with_storage_grid = this->device_->compute_with_storage_grid_size();
+    TestBufferConfig config = {
+        .num_pages = uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y),
+        .page_size = 2048,
+        .buftype = BufferType::L1};
+
+    CommandQueue a(this->device_, 0);
+    CommandQueue b(this->device_, 1);
+    vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
+    EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
+}
+
+TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) {
+    auto compute_with_storage_grid = this->device_->compute_with_storage_grid_size();
+    TestBufferConfig config = {
+        .num_pages = 2 * uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y),
+        .page_size = 2048,
+        .buftype = BufferType::L1};
+
+    CommandQueue a(this->device_, 0);
+    CommandQueue b(this->device_, 1);
+    vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
+    EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
+}
+
+TEST_F(MultiCommandQueueSingleDeviceFixture, TestNon32BAlignedPageSizeForL1) {
+    TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::L1};
+
+    CommandQueue a(this->device_, 0);
+    CommandQueue b(this->device_, 1);
+    vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
+    EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
+}
+
+}  // end namespace l1_tests
+}  // end namespace basic_tests
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp
new file mode 100644
index 00000000000..52ece27ba34
--- /dev/null
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp
@@ -0,0 +1,32 @@
+#include "gtest/gtest.h"
+#include "tt_metal/host_api.hpp"
+#include "tt_metal/test_utils/env_vars.hpp"
+#include "tt_metal/impl/dispatch/command_queue.hpp"
+#include "tt_metal/llrt/rtoptions.hpp"
+
+using namespace tt::tt_metal;
+
+class MultiCommandQueueSingleDeviceFixture : public ::testing::Test {
+   protected:
+    void SetUp() override {
+        auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE");
+        if (slow_dispatch) {
+            TT_THROW("This suite can only be run with fast dispatch, i.e. with TT_METAL_SLOW_DISPATCH_MODE unset");
+            GTEST_SKIP();
+        }
+        auto num_cqs = getenv("TT_METAL_NUM_HW_CQS");
+        if (num_cqs == nullptr or strcmp(num_cqs, "2")) {
+            TT_THROW("This suite must be run with TT_METAL_NUM_HW_CQS=2");
+            GTEST_SKIP();
+        }
+        arch_ = tt::get_arch_from_string(tt::test_utils::get_env_arch_name());
+        device_ = tt::tt_metal::CreateDevice(0, {});
+    }
+
+    void TearDown() override {
+        tt::tt_metal::CloseDevice(device_);
+    }
+
+    tt::tt_metal::Device* device_;
+    tt::ARCH arch_;
+};
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/module.mk b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/module.mk
new file mode 100644
index 00000000000..9c79cb97087
--- /dev/null
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/module.mk
@@ -0,0 +1,31 @@
+# Every variable in subdir must be prefixed with subdir (emulating a namespace)
+
+TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_SRCS_HOME = tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue
+
+TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE = ${TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_SRCS_HOME}/tests_main.cpp
+TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE += $(wildcard ${TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_SRCS_HOME}/*/*.cpp)
+TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE += $(wildcard ${TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_SRCS_HOME}/*/*/*.cpp)
+
+TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_OBJ_HOME = tt_metal/tests/unit_tests_fast_dispatch_single_chip_multi_queue/
+TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_SRCS = $(patsubst $(TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_SRCS_HOME)%, $(TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_OBJ_HOME)%, $(TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE))
+
+TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_INCLUDES = $(TEST_INCLUDES) $(TT_METAL_INCLUDES) -I$(TT_METAL_HOME)/$(TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_SRCS_HOME)/common
+TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_LDFLAGS = $(TT_METAL_UNIT_TESTS_LDFLAGS)
+
+TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_OBJS = $(addprefix $(OBJDIR)/, $(TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_SRCS:.cpp=.o))
+TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_DEPS = $(addprefix $(OBJDIR)/, $(TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_SRCS:.cpp=.d))
+
+-include $(TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_DEPS)
+
+# Each module has a top level target as the entrypoint which must match the subdir name
+tests/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue: $(TESTDIR)/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue
+
+.PRECIOUS: $(TESTDIR)/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue
+$(TESTDIR)/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue: $(TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_OBJS) $(TT_METAL_UNIT_TESTS_COMMON_OBJS) $(TT_METAL_LIB) $(TT_DNN_LIB)
+	@mkdir -p $(@D)
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_INCLUDES) -o $@ $^ $(LDFLAGS) $(TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_LDFLAGS)
+
+.PRECIOUS: $(OBJDIR)/$(TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_OBJ_HOME)/%.o
+$(OBJDIR)/$(TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_OBJ_HOME)/%.o: $(TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_SRCS_HOME)/%.cpp
+	@mkdir -p $(@D)
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(TT_METAL_UNIT_TESTS_FAST_DISPATCH_SINGLE_CHIP_MULTI_QUEUE_INCLUDES) -c -o $@ $<
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/tests_main.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/tests_main.cpp
new file mode 100644
index 00000000000..1e42f41a46c
--- /dev/null
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/tests_main.cpp
@@ -0,0 +1,5 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "gtest/gtest.h"
diff --git a/tt_metal/common/base.hpp b/tt_metal/common/base.hpp
index 0f2c5abb3a3..fc9840240ea 100644
--- a/tt_metal/common/base.hpp
+++ b/tt_metal/common/base.hpp
@@ -33,6 +33,9 @@
 using std::string;
 using std::size_t;
 using std::map;

+inline uint32_t align(uint32_t addr, uint32_t alignment) { return ((addr - 1) | (alignment - 1)) + 1; }
+

 namespace tt {
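The align helper hoisted into base.hpp relies on alignment being a power of two, since (alignment - 1) must form a contiguous low-bit mask. A minimal standalone sketch of the behavior (not part of the patch), which is how the dispatch code below uses it:

    #include <cassert>
    #include <cstdint>

    // Same bit trick as the align() added above: rounds addr up to the next
    // multiple of alignment. Only valid when alignment is a power of two.
    inline uint32_t align(uint32_t addr, uint32_t alignment) { return ((addr - 1) | (alignment - 1)) + 1; }

    int main() {
        assert(align(1, 32) == 32);   // rounds up
        assert(align(32, 32) == 32);  // already-aligned values are unchanged
        assert(align(33, 16) == 48);
        return 0;
    }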
diff --git a/tt_metal/common/metal_soc_descriptor.cpp b/tt_metal/common/metal_soc_descriptor.cpp
index 3b762ffb42b..35fa6b3b740 100644
--- a/tt_metal/common/metal_soc_descriptor.cpp
+++ b/tt_metal/common/metal_soc_descriptor.cpp
@@ -308,15 +308,26 @@ void metal_SocDescriptor::load_dispatch_and_banking_config(uint32_t harvesting_mask) {

     // dispatch_cores are a subset of worker cores
     // they have already been parsed as CoreType::WORKER and saved into `cores` map when parsing `functional_workers`
-    for (const auto& core_node : config["dispatch_cores"]) {
+    for (const auto& core_node : config["producer_cores"]) {
         RelativeCoreCoord coord = {};
         if (core_node.IsSequence()) {
             // Logical coord
             coord = RelativeCoreCoord({.x = core_node[0].as<int>(), .y = core_node[1].as<int>()});
         } else {
-            TT_THROW("Only logical relative coords supported for dispatch_cores");
+            TT_THROW("Only logical relative coords supported for producer_cores");
         }
-        this->dispatch_cores.push_back(coord);
+        this->producer_cores.push_back(coord);
+    }
+
+    for (const auto& core_node : config["consumer_cores"]) {
+        RelativeCoreCoord coord = {};
+        if (core_node.IsSequence()) {
+            // Logical coord
+            coord = RelativeCoreCoord({.x = core_node[0].as<int>(), .y = core_node[1].as<int>()});
+        } else {
+            TT_THROW("Only logical relative coords supported for consumer_cores");
+        }
+        this->consumer_cores.push_back(coord);
     }
 }
diff --git a/tt_metal/common/metal_soc_descriptor.h b/tt_metal/common/metal_soc_descriptor.h
index e59dd435d92..9bce58eb36c 100644
--- a/tt_metal/common/metal_soc_descriptor.h
+++ b/tt_metal/common/metal_soc_descriptor.h
@@ -20,7 +20,8 @@ struct metal_SocDescriptor : public tt_SocDescriptor {
     CoreCoord compute_with_storage_grid_size;
     std::vector<RelativeCoreCoord> compute_with_storage_cores;  // saved as CoreType::WORKER
     std::vector<RelativeCoreCoord> storage_cores;  // saved as CoreType::WORKER
-    std::vector<RelativeCoreCoord> dispatch_cores;  // saved as CoreType::WORKER
+    std::vector<RelativeCoreCoord> producer_cores;
+    std::vector<RelativeCoreCoord> consumer_cores;
     std::vector<CoreCoord> logical_ethernet_cores;
     int l1_bank_size;
     uint32_t dram_core_size;
diff --git a/tt_metal/common/test_common.hpp b/tt_metal/common/test_common.hpp
index 717dffa7c56..1d2284d2a6b 100644
--- a/tt_metal/common/test_common.hpp
+++ b/tt_metal/common/test_common.hpp
@@ -21,8 +21,7 @@
 // Needed for TargetDevice enum
 #include "common/base.hpp"

-inline std::string get_soc_description_file(const tt::ARCH &arch, tt::TargetDevice target_device, string output_dir = "") {
-
+inline std::string get_soc_description_file(const tt::ARCH &arch, tt::TargetDevice target_device, uint32_t num_cqs, string output_dir = "") {
     // Ability to skip this runtime opt, since trimmed SOC desc limits which DRAM channels are available.
     bool use_full_soc_desc = getenv("TT_METAL_VERSIM_FORCE_FULL_SOC_DESC");
     string tt_metal_home;
@@ -47,9 +46,21 @@ inline std::string get_soc_description_file(const tt::ARCH &arch, tt::TargetDevice target_device, uint32_t num_cqs, string output_dir) {
     switch (arch) {
         case tt::ARCH::Invalid: throw std::runtime_error("Invalid arch not supported"); // will be overwritten in tt_global_state constructor
         case tt::ARCH::JAWBRIDGE: throw std::runtime_error("JAWBRIDGE arch not supported");
-        case tt::ARCH::GRAYSKULL: return tt_metal_home + "tt_metal/soc_descriptors/grayskull_120_arch.yaml";
+        case tt::ARCH::GRAYSKULL: {
+            if (num_cqs == 1) {
+                return tt_metal_home + "tt_metal/soc_descriptors/grayskull_120_arch_one_cq.yaml";
+            } else if (num_cqs == 2) {
+                return tt_metal_home + "tt_metal/soc_descriptors/grayskull_120_arch_two_cqs.yaml";
+            }
+        }
         case tt::ARCH::WORMHOLE: throw std::runtime_error("WORMHOLE arch not supported");
-        case tt::ARCH::WORMHOLE_B0: return tt_metal_home + "tt_metal/soc_descriptors/wormhole_b0_80_arch.yaml";
+        case tt::ARCH::WORMHOLE_B0: {
+            if (num_cqs == 1) {
+                return tt_metal_home + "tt_metal/soc_descriptors/wormhole_b0_80_arch.yaml";
+            } else {
+                TT_THROW("Only one cq currently supported for wormhole");
+            }
+        }
         default: throw std::runtime_error("Unsupported device arch");
     };
 }
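A sketch of how the new num_cqs parameter selects a descriptor (the tt::TargetDevice::Silicon value is assumed here for illustration; it is not shown in this diff):

    // Hypothetical call site: select the two-CQ grayskull descriptor.
    std::string soc_file = get_soc_description_file(
        tt::ARCH::GRAYSKULL, tt::TargetDevice::Silicon, /*num_cqs=*/2);
    // -> "<TT_METAL_HOME>/tt_metal/soc_descriptors/grayskull_120_arch_two_cqs.yaml"

Note that a grayskull call with any num_cqs other than 1 or 2 falls through to the WORMHOLE case and throws its error, so callers are expected to pass only 1 or 2.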
diff --git a/tt_metal/detail/tt_metal.hpp b/tt_metal/detail/tt_metal.hpp
index 82009d2da41..f206ecaad96 100644
--- a/tt_metal/detail/tt_metal.hpp
+++ b/tt_metal/detail/tt_metal.hpp
@@ -284,9 +282,7 @@ namespace tt::tt_metal{
         static std::mutex cq_creation_mutex;
         {
             std::lock_guard<std::mutex> lock(cq_creation_mutex);
-            if (not command_queues[id] or (command_queues[id] and command_queues[id]->device != device)) {
-                command_queues[device->id()] = std::make_unique<CommandQueue>(device);
-            }
+            command_queues[device->id()] = std::make_unique<CommandQueue>(device, id);
         }
         return *(command_queues[id]);
     }
@@ -319,6 +317,7 @@ namespace tt::tt_metal{

     inline void GenerateDeviceHeaders(Device *device, const std::string &path)
     {
+        // Basic Allocator generates number of banks which may not be power of 2, so we could just pad and alias for now
         const size_t num_dram_banks = device->num_banks(BufferType::DRAM);
         const size_t num_dram_banks_pow2 = std::pow(2, std::ceil(std::log2(num_dram_banks)));
@@ -360,10 +359,6 @@ namespace tt::tt_metal{
             }
         }

-        auto dispatch_cores = device->dispatch_cores().begin();
-        CoreCoord producer_logical_core = *dispatch_cores++;
-        CoreCoord consumer_logical_core = *dispatch_cores;
-
         // Create valid PCIe address ranges
         // This implementation assumes contiguous ranges and aggregates the ranges into one bounds check
         // TODO: consider checking multiple ranges to detect straddling transactions
@@ -373,6 +368,8 @@ namespace tt::tt_metal{
             pcie_chan_end_addr += tt::Cluster::instance().get_host_channel_size(device->id(), pcie_chan);
         }

+        CoreCoord enqueue_program_dispatch_core = *device->consumer_cores().begin();
+
         jit_build_genfiles_noc_addr_ranges_header(
             path,
             pcie_chan_base_addr,
@@ -384,7 +381,7 @@ namespace tt::tt_metal{
             soc_d.get_physical_ethernet_cores(),
             soc_d.grid_size,
             harvested_rows,
-            {device->worker_core_from_logical_core(consumer_logical_core)});
+            device->worker_core_from_logical_core(enqueue_program_dispatch_core));
     }

     inline void CheckDataMovementConfig(Program &program, const std::string &file_name, const CoreRangeSet &core_ranges) {
@@ -445,77 +442,92 @@ namespace tt::tt_metal{
         );
     }

-    // Sending dispatch kernel. TODO(agrebenisan): Needs a refactor
-    inline void SendDispatchKernelToDevice(Device *device, uint32_t command_issue_region_size, uint32_t command_completion_region_size) {
+    inline void SendDispatchKernelsToDevice(Device *device, const SystemMemoryManager& manager, const uint32_t hugepage_channel) {
         ZoneScoped;

         Program dispatch_program = CreateProgram();
-        auto dispatch_cores = device->dispatch_cores().begin();
-        CoreCoord producer_logical_core = *dispatch_cores++;
-        CoreCoord consumer_logical_core = *dispatch_cores;
-
-        CoreCoord producer_physical_core = device->worker_core_from_logical_core(producer_logical_core);
-        CoreCoord consumer_physical_core = device->worker_core_from_logical_core(consumer_logical_core);
-
-        std::map<string, string> producer_defines = {
-            {"IS_DISPATCH_KERNEL", ""},
-            {"CONSUMER_NOC_X", std::to_string(consumer_physical_core.x)},
-            {"CONSUMER_NOC_Y", std::to_string(consumer_physical_core.y)},
-        };
-        std::map<string, string> consumer_defines = {
-            {"PRODUCER_NOC_X", std::to_string(producer_physical_core.x)},
-            {"PRODUCER_NOC_Y", std::to_string(producer_physical_core.y)},
-        };
-
-        std::vector<uint32_t> producer_compile_args = {command_issue_region_size};
-        std::vector<uint32_t> consumer_compile_args = {tt::Cluster::instance().get_tensix_soft_reset_addr(), command_issue_region_size, command_completion_region_size};
-
-        tt::tt_metal::CreateKernel(
-            dispatch_program,
-            "tt_metal/impl/dispatch/kernels/command_queue_producer.cpp",
-            producer_logical_core,
-            tt::tt_metal::DataMovementConfig {
-                .processor = tt::tt_metal::DataMovementProcessor::RISCV_0,
-                .noc = tt::tt_metal::NOC::RISCV_0_default,
-                .compile_args = producer_compile_args,
-                .defines = producer_defines});
-
-        tt::tt_metal::CreateKernel(
-            dispatch_program,
-            "tt_metal/impl/dispatch/kernels/command_queue_consumer.cpp",
-            consumer_logical_core,
-            tt::tt_metal::DataMovementConfig {
-                .processor = tt::tt_metal::DataMovementProcessor::RISCV_0,
-                .noc = tt::tt_metal::NOC::RISCV_0_default,
-                .compile_args = consumer_compile_args,
-                .defines = consumer_defines});
-
-        tt::tt_metal::CreateSemaphore(dispatch_program, producer_logical_core, 2);
-        tt::tt_metal::CreateSemaphore(dispatch_program, consumer_logical_core, 0);
-
-        detail::CompileProgram(device, dispatch_program);
-        tt::tt_metal::detail::ConfigureDeviceWithProgram(device, dispatch_program);
-
-        uint32_t issue_fifo_addr = CQ_START >> 4;
-        vector<uint32_t> issue_fifo_addr_vector = {issue_fifo_addr};
-        tt::tt_metal::detail::WriteToDeviceL1(device, producer_logical_core, CQ_ISSUE_READ_PTR, issue_fifo_addr_vector);
-        tt::tt_metal::detail::WriteToDeviceL1(device, producer_logical_core, CQ_ISSUE_WRITE_PTR, issue_fifo_addr_vector);
-
-        uint32_t completion_fifo_addr = command_issue_region_size >> 4;
-        vector<uint32_t> completion_fifo_addr_vector = {completion_fifo_addr};
-        tt::tt_metal::detail::WriteToDeviceL1(device, consumer_logical_core, CQ_COMPLETION_WRITE_PTR, completion_fifo_addr_vector);
-        tt::tt_metal::detail::WriteToDeviceL1(device, consumer_logical_core, CQ_COMPLETION_READ_PTR, completion_fifo_addr_vector);
-
-        tt::Cluster::instance().l1_barrier(device->id());
-
-        const std::tuple<uint32_t, uint32_t> tlb_data = tt::Cluster::instance().get_tlb_data(tt_cxy_pair(device->id(), device->worker_core_from_logical_core(*device->dispatch_cores().begin()))).value();
-        auto [tlb_offset, tlb_size] = tlb_data;
-
-        launch_msg_t msg = dispatch_program.kernels_on_core(producer_logical_core)->launch_msg;
-        // TODO(pkeller): Should use detail::LaunchProgram once we have a mechanism to avoid running all RISCs
-        tt::llrt::write_launch_msg_to_core(device->id(), producer_physical_core, &msg);
-        tt::llrt::write_launch_msg_to_core(device->id(), consumer_physical_core, &msg);
+        const uint32_t cq_size = tt::Cluster::instance().get_host_channel_size(device->id(), hugepage_channel) / device->producer_cores().size();
+        TT_ASSERT(device->producer_cores().size() == device->consumer_cores().size(), "There must be the same number of producers as there are consumers");
+        TT_ASSERT(device->producer_cores().size() > 0, "There must be at least 1 producer/consumer core");
+        TT_ASSERT(device->producer_cores().size() < 3, "There can be at most 2 hardware command queues on a given device");
+
+        uint8_t cq_channel = 0;
+        vector<int> dummy;
+        std::transform(
+            device->producer_cores().begin(), device->producer_cores().end(),
+            device->consumer_cores().begin(), std::back_inserter(dummy),
+            [&device, &dispatch_program, &cq_channel, &cq_size, &manager](const CoreCoord& producer_logical_core, const CoreCoord& consumer_logical_core) {
+
+                CoreCoord producer_physical_core = device->worker_core_from_logical_core(producer_logical_core);
+                CoreCoord consumer_physical_core = device->worker_core_from_logical_core(consumer_logical_core);
+
+                std::map<string, string> producer_defines = {
+                    {"CONSUMER_NOC_X", std::to_string(consumer_physical_core.x)},
+                    {"CONSUMER_NOC_Y", std::to_string(consumer_physical_core.y)},
+                };
+                std::map<string, string> consumer_defines = {
+                    {"PRODUCER_NOC_X", std::to_string(producer_physical_core.x)},
+                    {"PRODUCER_NOC_Y", std::to_string(producer_physical_core.y)},
+                };
+
+                // Address in sysmem for CQ to write back its read ptr to
+                uint32_t host_issue_queue_read_ptr_addr = HOST_CQ_ISSUE_READ_PTR + cq_channel * cq_size;
+                uint32_t issue_queue_start_addr = CQ_START + cq_channel * cq_size;
+                uint32_t issue_queue_size = manager.get_issue_queue_size(cq_channel);
+                vector<uint32_t> producer_compile_args = {host_issue_queue_read_ptr_addr, issue_queue_start_addr, issue_queue_size};
+
+                uint32_t host_completion_queue_write_ptr_addr = HOST_CQ_COMPLETION_WRITE_PTR + cq_channel * cq_size;
+                uint32_t completion_queue_start_addr = CQ_START + issue_queue_size + cq_channel * cq_size;
+                uint32_t completion_queue_size = manager.get_completion_queue_size(cq_channel);
+                uint32_t host_finish_addr = HOST_CQ_FINISH_PTR + cq_channel * cq_size;
+                vector<uint32_t> consumer_compile_args = {host_completion_queue_write_ptr_addr, completion_queue_start_addr, completion_queue_size, host_finish_addr};
+
+                tt::tt_metal::CreateKernel(
+                    dispatch_program,
+                    "tt_metal/impl/dispatch/kernels/command_queue_producer.cpp",
+                    producer_logical_core,
+                    tt::tt_metal::DataMovementConfig {
+                        .processor = tt::tt_metal::DataMovementProcessor::RISCV_0,
+                        .noc = tt::tt_metal::NOC::RISCV_0_default,
+                        .compile_args = producer_compile_args,
+                        .defines = producer_defines});
+
+                tt::tt_metal::CreateKernel(
+                    dispatch_program,
+                    "tt_metal/impl/dispatch/kernels/command_queue_consumer.cpp",
+                    consumer_logical_core,
+                    tt::tt_metal::DataMovementConfig {
+                        .processor = tt::tt_metal::DataMovementProcessor::RISCV_0,
+                        .noc = tt::tt_metal::NOC::RISCV_0_default,
+                        .compile_args = consumer_compile_args,
+                        .defines = consumer_defines});
+
+                tt::tt_metal::CreateSemaphore(dispatch_program, producer_logical_core, 2);
+                tt::tt_metal::CreateSemaphore(dispatch_program, consumer_logical_core, 0);
+
+                CompileProgram(device, dispatch_program);
+                ConfigureDeviceWithProgram(device, dispatch_program);
+
+                // The read and write pointers are equal by this point, but the logic may look a bit awkward
+                vector<uint32_t> issue_queue_read_ptr = {manager.get_issue_queue_write_ptr(cq_channel) >> 4};
+                vector<uint32_t> completion_queue_wr_ptr = {manager.get_completion_queue_read_ptr(cq_channel) >> 4};
+
+                WriteToDeviceL1(device, producer_logical_core, CQ_ISSUE_READ_PTR, issue_queue_read_ptr);
+                WriteToDeviceL1(device, producer_logical_core, CQ_ISSUE_WRITE_PTR, issue_queue_read_ptr);
+                WriteToDeviceL1(device, consumer_logical_core, CQ_COMPLETION_READ_PTR, completion_queue_wr_ptr);
+                WriteToDeviceL1(device, consumer_logical_core, CQ_COMPLETION_WRITE_PTR, completion_queue_wr_ptr);
+                tt::Cluster::instance().l1_barrier(device->id());
+
+                launch_msg_t msg = dispatch_program.kernels_on_core(producer_logical_core)->launch_msg;
+                tt::llrt::write_launch_msg_to_core(device->id(), producer_physical_core, &msg);
+                tt::llrt::write_launch_msg_to_core(device->id(), consumer_physical_core, &msg);
+
+                cq_channel++;
+                return 0;
+            });
     }
 }
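To make the per-channel address arithmetic above easier to follow, here is a minimal standalone sketch of the layout each hardware CQ gets: a contiguous cq_size slice of the hugepage, with the issue queue first and the completion queue after it. The constants are placeholders; the real CQ_START and HOST_CQ_* values live in the dispatch headers.

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t CQ_START = 96;            // placeholder for the real constant
        const uint32_t hugepage_size = 1u << 30; // assumption: 1 GB hugepage
        const uint32_t num_hw_cqs = 2;
        const uint32_t cq_size = hugepage_size / num_hw_cqs;

        for (uint32_t cq_channel = 0; cq_channel < num_hw_cqs; cq_channel++) {
            // Mirrors SendDispatchKernelsToDevice: every base address is offset
            // by cq_channel * cq_size to land in this channel's slice.
            const uint32_t issue_queue_size = cq_size / 2;  // assumption: 50/50 split
            const uint32_t issue_queue_start = CQ_START + cq_channel * cq_size;
            const uint32_t completion_queue_start = CQ_START + issue_queue_size + cq_channel * cq_size;
            std::printf("cq %u: issue @ 0x%x, completion @ 0x%x\n",
                        (unsigned)cq_channel, issue_queue_start, completion_queue_start);
        }
        return 0;
    }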
diff --git a/tt_metal/host_api.hpp b/tt_metal/host_api.hpp
index 82e7055dfde..5a74210a20f 100644
--- a/tt_metal/host_api.hpp
+++ b/tt_metal/host_api.hpp
@@ -56,9 +56,9 @@ size_t GetNumPCIeDevices();
 *
 * Return value: Device *
 *
-* | Argument  | Description                | Type            | Valid Range                       | Required |
-* |-----------|----------------------------|-----------------|-----------------------------------|----------|
-* | device_id | ID of the device to target | chip_id_t (int) | 0 to (GetNumAvailableDevices - 1) | Yes      |
+* | Argument   | Description                | Type            | Valid Range                       | Required |
+* |------------|----------------------------|-----------------|-----------------------------------|----------|
+* | device_id  | ID of the device to target | chip_id_t (int) | 0 to (GetNumAvailableDevices - 1) | Yes      |
 */
 Device *CreateDevice(chip_id_t device_id, const std::vector<uint32_t>& l1_bank_remap = {});
diff --git a/tt_metal/hw/inc/dev_msgs.h b/tt_metal/hw/inc/dev_msgs.h
index 90c7fa3b227..87e7dc82823 100644
--- a/tt_metal/hw/inc/dev_msgs.h
+++ b/tt_metal/hw/inc/dev_msgs.h
@@ -43,14 +43,18 @@ struct launch_msg_t {  // must be cacheline aligned
     volatile uint16_t ncrisc_watcher_kernel_id;
     volatile uint16_t triscs_watcher_kernel_id;
     volatile uint16_t ncrisc_kernel_size16;  // size in 16 byte units
-    volatile uint8_t mode;
-    volatile uint8_t brisc_noc_id;
-    volatile uint8_t enable_brisc;
-    volatile uint8_t enable_ncrisc;
-    volatile uint8_t enable_triscs;
-    volatile uint8_t max_cb_index;
-    volatile uint8_t enable_erisc;
-    volatile uint8_t run;  // must be in last cacheline of this msg
+
+    // TODO(agrebenisan): These fields still need to be added to launch_msg_t
+    // volatile uint16_t dispatch_core_x;
+    // volatile uint16_t dispatch_core_y;
+    volatile uint8_t mode;
+    volatile uint8_t brisc_noc_id;
+    volatile uint8_t enable_brisc;
+    volatile uint8_t enable_ncrisc;
+    volatile uint8_t enable_triscs;
+    volatile uint8_t max_cb_index;
+    volatile uint8_t enable_erisc;
+    volatile uint8_t run;  // must be in last cacheline of this msg
 };

 struct slave_sync_msg_t {
diff --git a/tt_metal/hw/inc/mod_div_lib.h b/tt_metal/hw/inc/mod_div_lib.h
index e59e64b8ea3..b52c521266f 100644
--- a/tt_metal/hw/inc/mod_div_lib.h
+++ b/tt_metal/hw/inc/mod_div_lib.h
@@ -33,6 +33,11 @@ inline __attribute__((always_inline)) uint32_t fast_udiv_94(uint32_t n)
 {
     return (((uint64_t) n * 0xAE4C415D) >> 32) >> 6;
 }

+inline __attribute__((always_inline)) uint32_t fast_udiv_124(uint32_t n)
+{
+    return (((uint64_t) n * 0x08421085) >> 32) >> 2;
+}
+
 template <uint32_t d>
 inline __attribute__((always_inline)) uint32_t udivsi3_const_divisor(uint32_t n)
 {
@@ -42,6 +47,8 @@ inline __attribute__((always_inline)) uint32_t udivsi3_const_divisor(uint32_t n)
     } else if constexpr (d == 94) {
         // fast divide for 94 divisor. Handles Banked L1 address generation for E75
         return fast_udiv_94(n);
+    } else if constexpr (d == 124) {
+        return fast_udiv_124(n);
     } else {
         // generic divide from llvm
         const unsigned n_uword_bits = sizeof(uint32_t) * CHAR_BIT;
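The new magic constant can be sanity-checked against plain division: 0x08421085 = ceil(2^34 / 124), and the product overshoots 2^34 by e = 108, so the multiply-shift identity n / 124 == (n * M) >> 34 holds whenever 108 * n < (124 - n % 124) * 2^34, which comfortably covers the page/bank indices this helper is fed (but not the entire uint32_t range). A quick standalone check, not part of the patch:

    #include <cassert>
    #include <cstdint>

    // Same multiply-shift as the fast_udiv_124 added above.
    static uint32_t fast_udiv_124(uint32_t n) { return (((uint64_t)n * 0x08421085) >> 32) >> 2; }

    int main() {
        // Dense low range plus a block around 1e8, both inside the validity bound
        // (worst residue class fails only above roughly 2^34 / 108, i.e. ~1.59e8).
        for (uint32_t n = 0; n < 2000000; n++) {
            assert(fast_udiv_124(n) == n / 124);
        }
        for (uint32_t n = 100000000; n < 100000124; n++) {
            assert(fast_udiv_124(n) == n / 124);
        }
        return 0;
    }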
diff --git a/tt_metal/impl/allocator/allocator.cpp b/tt_metal/impl/allocator/allocator.cpp
index 121bd5049b6..64caa4b376d 100644
--- a/tt_metal/impl/allocator/allocator.cpp
+++ b/tt_metal/impl/allocator/allocator.cpp
@@ -31,7 +31,7 @@ void validate_num_banks(uint32_t num_banks, const BufferType &buffer_type) {
     // Dataflow API does not have a working implementation of generic modulo to determine bank_id for interleaved address gen
     // For non pow2 num banks, special cases need to be added to avoid falling back to generic implementation.
     // See https://github.com/tenstorrent-metal/tt-metal/issues/3321
-    bool custom_mod_bank_id_calculation_exists = (num_banks == 12 or num_banks == 94);
+    bool custom_mod_bank_id_calculation_exists = (num_banks == 12 or num_banks == 94 or num_banks == 124);
     bool valid_num_banks = (is_pow2_num_banks or custom_mod_bank_id_calculation_exists);
     if (not valid_num_banks) {
         TT_THROW("Invalid number of memory banks for {}. Num banks must be power of 2 or have a dedicated modulo implementation", magic_enum::enum_name(buffer_type));
diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp
index 3431ab35f1b..fe71e52e160 100644
--- a/tt_metal/impl/device/device.cpp
+++ b/tt_metal/impl/device/device.cpp
@@ -90,24 +90,19 @@ void Device::initialize_allocator(const std::vector<uint32_t>& l1_bank_remap) {
     for (const auto& core : soc_desc.physical_cores) {
         config.core_type_from_noc_coord_table.insert({core.first, AllocCoreType::Invalid});
     }
-    for (const auto& core : soc_desc.compute_with_storage_cores) {
-        const auto logical_coord = get_core_coord_from_relative(core, this->logical_grid_size());
-        this->compute_cores.insert(logical_coord);
-        const auto noc_coord = this->worker_core_from_logical_core(logical_coord);
-        config.core_type_from_noc_coord_table[noc_coord] = AllocCoreType::ComputeAndStore;
-    }
-    for (const auto& core : soc_desc.storage_cores) {
-        const auto logical_coord = get_core_coord_from_relative(core, this->logical_grid_size());
-        this->storage_only_cores_.insert(logical_coord);
-        const auto noc_coord = this->worker_core_from_logical_core(logical_coord);
-        config.core_type_from_noc_coord_table[noc_coord] = AllocCoreType::StorageOnly;
-    }
-    for (const auto& core : soc_desc.dispatch_cores) {
-        const auto logical_coord = get_core_coord_from_relative(core, this->logical_grid_size());
-        this->dispatch_cores_.insert(logical_coord);
-        const auto noc_coord = this->worker_core_from_logical_core(logical_coord);
-        config.core_type_from_noc_coord_table[noc_coord] = AllocCoreType::Dispatch;
-    }
+
+    auto extract_core_info_from_soc_desc = [&config, this](AllocCoreType alloc_core_type, const std::vector<RelativeCoreCoord>& soc_cores, std::set<CoreCoord>& cores) {
+        for (const auto& core : soc_cores) {
+            const auto logical_coord = get_core_coord_from_relative(core, this->logical_grid_size());
+            cores.insert(logical_coord);
+            const auto noc_coord = this->worker_core_from_logical_core(logical_coord);
+            config.core_type_from_noc_coord_table[noc_coord] = alloc_core_type;
+        }
+    };
+    extract_core_info_from_soc_desc(AllocCoreType::ComputeAndStore, soc_desc.compute_with_storage_cores, this->compute_cores_);
+    extract_core_info_from_soc_desc(AllocCoreType::StorageOnly, soc_desc.storage_cores, this->storage_only_cores_);
+    extract_core_info_from_soc_desc(AllocCoreType::Dispatch, soc_desc.producer_cores, this->producer_cores_);
+    extract_core_info_from_soc_desc(AllocCoreType::Dispatch, soc_desc.consumer_cores, this->consumer_cores_);
     for (const auto &core : soc_desc.get_logical_ethernet_cores()) {
         this->ethernet_cores_.insert(core);
     }
@@ -291,21 +286,43 @@ bool Device::initialize(const std::vector<uint32_t>& l1_bank_remap) {

     // Create system memory writer for this device to have an associated interface to hardware command queue (i.e. hugepage)
     if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr) {
+        vector<std::pair<CoreCoord, CoreCoord>> dispatch_cores;
+        std::transform(
+            this->producer_cores().begin(), this->producer_cores().end(),
+            this->consumer_cores().begin(), std::back_inserter(dispatch_cores),
+            [](const CoreCoord& producer_core, const CoreCoord& consumer_core) {
+                return std::make_pair(producer_core, consumer_core);
+            });
+
         this->sysmem_manager = std::make_unique<SystemMemoryManager>(
             this->id_,
-            this->dispatch_cores(),
+            dispatch_cores,
             [&, this](CoreCoord core) { return this->worker_core_from_logical_core(core); }
         );

+        uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->id_);
+        chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->id_);
         std::vector<uint32_t> pointers(CQ_START / sizeof(uint32_t), 0);
-        pointers[HOST_CQ_ISSUE_READ_PTR / sizeof(uint32_t)] = CQ_START >> 4;  // HOST_CQ_ISSUE_READ_PTR
-        pointers[HOST_CQ_COMPLETION_WRITE_PTR / sizeof(uint32_t)] = this->sysmem_manager->cq_interface.command_issue_region_size >> 4;  // HOST_CQ_COMPLETION_WRITE_PTR
+        const uint32_t hugepage_size = tt::Cluster::instance().get_host_channel_size(this->id_, channel);

-        chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->id_);
-        uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->id_);
-        tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), 0, mmio_device_id, channel);
+        const char *NUM_HW_CQS = std::getenv("TT_METAL_NUM_HW_CQS");
+        uint8_t num_hw_cqs = 1;
+        if (NUM_HW_CQS != nullptr) {
+            num_hw_cqs = std::stoi(NUM_HW_CQS);
+        }
+        const uint32_t cq_channel_size = hugepage_size / num_hw_cqs;
+
+        for (uint8_t command_queue_channel = 0; command_queue_channel < num_hw_cqs; command_queue_channel++) {
+            pointers[HOST_CQ_ISSUE_READ_PTR / sizeof(uint32_t)] = (CQ_START + command_queue_channel * cq_channel_size) >> 4;
+            pointers[HOST_CQ_COMPLETION_WRITE_PTR / sizeof(uint32_t)] = (CQ_START + this->sysmem_manager->get_issue_queue_size(command_queue_channel) + command_queue_channel * cq_channel_size) >> 4;
+            tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), command_queue_channel * cq_channel_size, mmio_device_id, channel);
+        }

-        detail::SendDispatchKernelToDevice(this, this->sysmem_manager->cq_interface.command_issue_region_size, this->sysmem_manager->cq_interface.command_completion_region_size);
+        detail::SendDispatchKernelsToDevice(this, *this->sysmem_manager, channel);
     }

     return true;
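One detail worth making explicit in the loop above: the HOST_CQ_* pointers are stored in 16-byte units (hence the >> 4), and the write_sysmem offset of command_queue_channel * cq_channel_size places each pointer block at the head of its channel's hugepage slice. A small sketch with placeholder constants:

    #include <cassert>
    #include <cstdint>

    int main() {
        const uint32_t CQ_START = 96;            // placeholder constant
        const uint32_t cq_channel_size = 1u << 29;
        for (uint8_t cq = 0; cq < 2; cq++) {
            // Pointer is encoded in 16-byte units; shifting back recovers the byte address.
            uint32_t issue_rd_ptr_16B = (CQ_START + cq * cq_channel_size) >> 4;
            assert(((uint64_t)issue_rd_ptr_16B << 4) == CQ_START + (uint64_t)cq * cq_channel_size);
        }
        return 0;
    }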
diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp
index f0a4e4b7669..366e493fc12 100644
--- a/tt_metal/impl/device/device.hpp
+++ b/tt_metal/impl/device/device.hpp
@@ -129,8 +129,10 @@ class Device {
     // Set of logical storage only core coordinates
     const std::set<CoreCoord> &storage_only_cores() const { return this->storage_only_cores_; }

-    // Set of logical dispatch core coordinates
-    const std::set<CoreCoord> &dispatch_cores() const { return this->dispatch_cores_; }
+    // Sets of logical dispatch (producer/consumer) core coordinates
+    const std::set<CoreCoord> &producer_cores() const { return this->producer_cores_; }
+    const std::set<CoreCoord> &consumer_cores() const { return this->consumer_cores_; }

     // Set of logical ethernet core coordinates
     // core.x represents connectivity to one other chip, i.e. cores with all connect to same chip
@@ -188,9 +190,10 @@ class Device {
     // Allows access to sysmem_writer
     friend class CommandQueue;

-    std::set<CoreCoord> compute_cores;
+    std::set<CoreCoord> compute_cores_;
     std::set<CoreCoord> storage_only_cores_;
-    std::set<CoreCoord> dispatch_cores_;
+    std::set<CoreCoord> producer_cores_;
+    std::set<CoreCoord> consumer_cores_;
     std::set<CoreCoord> ethernet_cores_;
 };
diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp
index b7306de1bb4..3f8db708031 100644
--- a/tt_metal/impl/dispatch/command_queue.cpp
+++ b/tt_metal/impl/dispatch/command_queue.cpp
@@ -26,13 +26,11 @@ uint32_t get_noc_multicast_encoding(const CoreCoord& top_left, const CoreCoord& bottom_right) {

 uint32_t get_noc_unicast_encoding(CoreCoord coord) { return NOC_XY_ENCODING(NOC_X(coord.x), NOC_Y(coord.y)); }

-uint32_t align(uint32_t addr, uint32_t alignment) { return ((addr - 1) | (alignment - 1)) + 1; }
-
-
-ProgramMap ConstructProgramMap(const Device* device, Program& program) {
+ProgramMap ConstructProgramMap(const Device* device, Program& program, const CoreCoord& dispatch_logical_core) {
     /*
         TODO(agrebenisan): Move this logic to compile program
     */
+    CoreCoord dispatch_core = device->worker_core_from_logical_core(dispatch_logical_core);
     vector<transfer_info> runtime_arg_page_transfers;
     vector<transfer_info> cb_config_page_transfers;
     vector<transfer_info> program_page_transfers;
@@ -277,19 +275,27 @@ ProgramMap ConstructProgramMap(const Device* device, Program& program, const CoreCoord& dispatch_logical_core) {

     // Since the GO signal begins in a new page, we need to advance the index
     program_page_idx = align(program_page_idx, DeviceCommand::PROGRAM_PAGE_SIZE / sizeof(uint32_t));

-    for (const KernelGroup& kg: program.get_kernel_groups()) {
+    for (KernelGroup& kg: program.get_kernel_groups()) {
+        // TODO(agrebenisan): Hangs when we extend the launch msg. Needs to be investigated. For now,
+        // only supporting enqueue program for cq 0 on a device.
+        // kg.launch_msg.dispatch_core_x = dispatch_core.x;
+        // kg.launch_msg.dispatch_core_y = dispatch_core.y;
         uint32_t *launch_message_data = (uint32_t *)&kg.launch_msg;
         program_pages[program_page_idx] = launch_message_data[0];
         program_pages[program_page_idx + 1] = launch_message_data[1];
         program_pages[program_page_idx + 2] = launch_message_data[2];
         program_pages[program_page_idx + 3] = launch_message_data[3];
-        program_page_idx += 4;
+        program_page_idx += sizeof(launch_msg_t) / sizeof(uint32_t);
     }

     uint32_t num_workers = 0;
     if (program.logical_cores().find(CoreType::WORKER) != program.logical_cores().end()) {
         num_workers = program.logical_cores().at(CoreType::WORKER).size();
     }

     return {
         .num_workers = num_workers,
         .program_pages = std::move(program_pages),
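Since the loop above advances by sizeof(launch_msg_t) / sizeof(uint32_t) but still writes only four words explicitly, a compile-time guard would catch a size drift, e.g. when the commented-out dispatch_core_x/y fields are eventually re-enabled. A sketch using a stand-in struct (the real launch_msg_t has these fields plus a brisc_watcher_kernel_id that is outside this hunk's context):

    #include <cstdint>

    struct launch_msg_t_example {  // stand-in for the real launch_msg_t
        volatile uint16_t brisc_watcher_kernel_id;
        volatile uint16_t ncrisc_watcher_kernel_id;
        volatile uint16_t triscs_watcher_kernel_id;
        volatile uint16_t ncrisc_kernel_size16;
        volatile uint8_t mode, brisc_noc_id, enable_brisc, enable_ncrisc,
            enable_triscs, max_cb_index, enable_erisc, run;
    };
    // The page-packing loop copies exactly 4 words per launch message.
    static_assert(sizeof(launch_msg_t_example) / sizeof(uint32_t) == 4,
                  "launch_msg_t grew; update the page-packing loop");

    int main() { return 0; }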
pages_to_write.value() : buffer.num_pages()) { TT_ASSERT( buffer.buffer_type() == BufferType::DRAM or buffer.buffer_type() == BufferType::L1, "Trying to write to an invalid buffer"); @@ -462,7 +471,7 @@ const DeviceCommand EnqueueWriteBufferCommand::assemble_device_command(uint32_t core_id_y ); } - else{ + else { command.add_buffer_transfer_instruction( src_address, buffer_address, @@ -507,16 +516,17 @@ const DeviceCommand EnqueueWriteBufferCommand::assemble_device_command(uint32_t } void EnqueueWriteBufferCommand::process() { - uint32_t write_ptr = this->manager.get_issue_queue_write_ptr(); + uint32_t write_ptr = this->manager.get_issue_queue_write_ptr(this->command_queue_channel); uint32_t system_memory_temporary_storage_address = write_ptr + DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND; const auto cmd = this->assemble_device_command(system_memory_temporary_storage_address); uint32_t data_size_in_bytes = cmd.get_data_size(); uint32_t cmd_size = DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND + data_size_in_bytes; - this->manager.issue_queue_reserve_back(cmd_size); - this->manager.cq_write(cmd.get_desc().data(), DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND, write_ptr); + this->manager.issue_queue_reserve_back(cmd_size, this->command_queue_channel); + // std::cout << "Writing write buffer command to " << write_ptr << std::endl; + this->manager.cq_write(cmd.get_desc().data(), DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND, write_ptr); uint32_t unpadded_src_offset = this->dst_page_index * this->buffer.page_size(); if (this->buffer.page_size() % 32 != 0 and this->buffer.page_size() != this->buffer.size()) { @@ -531,12 +541,13 @@ void EnqueueWriteBufferCommand::process() { this->manager.cq_write((char*)this->src + unpadded_src_offset, data_size_in_bytes, system_memory_temporary_storage_address); } - this->manager.issue_queue_push_back(cmd_size, LAZY_COMMAND_QUEUE_MODE); + this->manager.issue_queue_push_back(cmd_size, LAZY_COMMAND_QUEUE_MODE, this->command_queue_channel); } EnqueueCommandType EnqueueWriteBufferCommand::type() { return this->type_; } EnqueueProgramCommand::EnqueueProgramCommand( + uint32_t command_queue_channel, Device* device, Buffer& buffer, ProgramMap& program_to_dev_map, @@ -544,7 +555,7 @@ EnqueueProgramCommand::EnqueueProgramCommand( const Program& program, bool stall ) : - buffer(buffer), program_to_dev_map(program_to_dev_map), manager(manager), program(program), stall(stall) { + command_queue_channel(command_queue_channel), buffer(buffer), program_to_dev_map(program_to_dev_map), manager(manager), program(program), stall(stall) { this->device = device; } @@ -657,14 +668,14 @@ const DeviceCommand EnqueueProgramCommand::assemble_device_command(uint32_t host } void EnqueueProgramCommand::process() { - uint32_t write_ptr = this->manager.get_issue_queue_write_ptr(); + uint32_t write_ptr = this->manager.get_issue_queue_write_ptr(this->command_queue_channel); uint32_t system_memory_temporary_storage_address = write_ptr + DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND; const DeviceCommand cmd = this->assemble_device_command(system_memory_temporary_storage_address); uint32_t data_size_in_bytes = cmd.get_data_size(); const uint32_t cmd_size = DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND + data_size_in_bytes; - this->manager.issue_queue_reserve_back(cmd_size); + this->manager.issue_queue_reserve_back(cmd_size, this->command_queue_channel); this->manager.cq_write(cmd.get_desc().data(), DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND, write_ptr); uint32_t start_addr = system_memory_temporary_storage_address; 
@@ -689,13 +700,12 @@ void EnqueueProgramCommand::process() { } } - this->manager.issue_queue_push_back(cmd_size, LAZY_COMMAND_QUEUE_MODE); + this->manager.issue_queue_push_back(cmd_size, LAZY_COMMAND_QUEUE_MODE, this->command_queue_channel); } EnqueueCommandType EnqueueProgramCommand::type() { return this->type_; } -// FinishCommand section -FinishCommand::FinishCommand(Device* device, SystemMemoryManager& manager) : manager(manager) { this->device = device; } +FinishCommand::FinishCommand(uint32_t command_queue_channel, Device* device, SystemMemoryManager& manager) : command_queue_channel(command_queue_channel), manager(manager) { this->device = device; } const DeviceCommand FinishCommand::assemble_device_command(uint32_t) { DeviceCommand command; @@ -704,19 +714,19 @@ const DeviceCommand FinishCommand::assemble_device_command(uint32_t) { } void FinishCommand::process() { - uint32_t write_ptr = this->manager.get_issue_queue_write_ptr(); + uint32_t write_ptr = this->manager.get_issue_queue_write_ptr(this->command_queue_channel); const auto cmd = this->assemble_device_command(0); uint32_t cmd_size = DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND; - - this->manager.issue_queue_reserve_back(cmd_size); + this->manager.issue_queue_reserve_back(cmd_size, this->command_queue_channel); + // std::cout << "Writing finish command to " << write_ptr << std::endl; this->manager.cq_write(cmd.get_desc().data(), DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND, write_ptr); - this->manager.issue_queue_push_back(cmd_size, false); + this->manager.issue_queue_push_back(cmd_size, false, this->command_queue_channel); } EnqueueCommandType FinishCommand::type() { return this->type_; } // EnqueueWrapCommand section -EnqueueWrapCommand::EnqueueWrapCommand(Device* device, SystemMemoryManager& manager, DeviceCommand::WrapRegion wrap_region) : manager(manager), wrap_region(wrap_region) { +EnqueueWrapCommand::EnqueueWrapCommand(uint32_t command_queue_channel, Device* device, SystemMemoryManager& manager, DeviceCommand::WrapRegion wrap_region) : command_queue_channel(command_queue_channel), manager(manager), wrap_region(wrap_region) { this->device = device; } @@ -726,34 +736,58 @@ const DeviceCommand EnqueueWrapCommand::assemble_device_command(uint32_t) { } void EnqueueWrapCommand::process() { - uint32_t write_ptr = this->manager.get_issue_queue_write_ptr(); - uint32_t space_left_in_bytes = this->manager.cq_interface.command_issue_region_size - write_ptr; + uint32_t write_ptr = this->manager.get_issue_queue_write_ptr(this->command_queue_channel); + uint32_t space_left_in_bytes = this->manager.get_issue_queue_limit(this->command_queue_channel) - write_ptr; // There may not be enough space in the issue queue to submit another command // In that case we write as big of a vector as we can with the wrap index (0) set to wrap type // To ensure that the issue queue write pointer does wrap, we need the wrap packet to be the full size of the issue queue - uint32_t wrap_packet_size_bytes = this->wrap_region == DeviceCommand::WrapRegion::ISSUE ? 
space_left_in_bytes : DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND; + uint32_t wrap_packet_size_bytes = std::min(space_left_in_bytes, DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND); // Since all of the values will be 0, this will be equivalent to // a bunch of NOPs vector<uint32_t> command_vector(wrap_packet_size_bytes / sizeof(uint32_t), 0); - command_vector[DeviceCommand::wrap_idx] = (uint32_t)this->wrap_region; // wrap + command_vector[DeviceCommand::wrap_idx] = (uint32_t)this->wrap_region; - this->manager.issue_queue_reserve_back(wrap_packet_size_bytes); + this->manager.issue_queue_reserve_back(wrap_packet_size_bytes, this->command_queue_channel); this->manager.cq_write(command_vector.data(), command_vector.size() * sizeof(uint32_t), write_ptr); - this->manager.issue_queue_push_back(wrap_packet_size_bytes, LAZY_COMMAND_QUEUE_MODE); if (this->wrap_region == DeviceCommand::WrapRegion::COMPLETION) { // Wrap the read pointers for completion queue because device will start writing data at head of completion queue and there are no more reads to be done at current completion queue write pointer // If we don't wrap the read then the subsequent read buffer command may attempt to read past the total command queue size // because the read buffer command will see updated write pointer to compute num pages to read but the local read pointer is pointing to tail of completion queue - this->manager.wrap_completion_queue_locally(); + this->manager.wrap_completion_queue_rd_ptr(this->command_queue_channel); + this->manager.issue_queue_push_back(wrap_packet_size_bytes, LAZY_COMMAND_QUEUE_MODE, this->command_queue_channel); } else { + this->manager.wrap_issue_queue_wr_ptr(this->command_queue_channel); + } } EnqueueCommandType EnqueueWrapCommand::type() { return this->type_; } // CommandQueue section -CommandQueue::CommandQueue(Device* device) { +CommandQueue::CommandQueue(Device* device, uint32_t command_queue_channel): manager(*device->sysmem_manager) { this->device = device; + this->command_queue_channel = command_queue_channel; + + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); + this->command_queue_channel_size = tt::Cluster::instance().get_host_channel_size(device->id(), tt::Cluster::instance().get_assigned_channel_for_device(device->id())) / device->producer_cores().size(); + + uint32_t channel_idx = 0; + const auto& dispatch_cores = device->consumer_cores(); + if (auto it = std::find_if(dispatch_cores.begin(), dispatch_cores.end(), + [&channel_idx, &command_queue_channel](const CoreCoord& coord) { + return (channel_idx++ == command_queue_channel); + } + ); it != dispatch_cores.end()) { + this->dispatch_core = *it; + } else { + TT_THROW("Could not find a dispatch core for the provided channel"); + } } CommandQueue::~CommandQueue() {}
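The counter-based std::find_if in the constructor above only advances an iterator command_queue_channel steps, so the lookup can be expressed more directly. An equivalent sketch, assuming consumer_cores() returns an ordered container with a size() accessor:

    const auto& dispatch_cores = device->consumer_cores();
    if (command_queue_channel < dispatch_cores.size()) {
        // Same behavior as the find_if: take the channel-th consumer core in order.
        this->dispatch_core = *std::next(dispatch_cores.begin(), command_queue_channel);
    } else {
        TT_THROW("Could not find a dispatch core for the provided channel");
    }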
@@ -809,56 +843,50 @@ void CommandQueue::enqueue_read_buffer(Buffer& buffer, void* dst, bool blocking) { ZoneScopedN("CommandQueue_read_buffer"); TT_FATAL(blocking, "EnqueueReadBuffer only has support for blocking mode currently"); + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device->id()); uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id()); - uint32_t read_buffer_command_size = DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND; uint32_t padded_page_size = align(buffer.page_size(), 32); uint32_t total_pages_to_read = buffer.num_pages(); - uint32_t src_offset = 0; - - uint32_t unpadded_bytes_to_read = buffer.size(); uint32_t unpadded_dst_offset = 0; - + uint32_t src_page_index = 0; while (total_pages_to_read > 0) { - if ((this->device->sysmem_manager->get_issue_queue_write_ptr()) + read_buffer_command_size >= this->device->sysmem_manager->cq_interface.command_issue_region_size) { + if ((this->manager.get_issue_queue_write_ptr(this->command_queue_channel)) + read_buffer_command_size >= this->manager.get_issue_queue_limit(this->command_queue_channel)) { this->wrap(); } - tt::log_debug(tt::LogDispatch, "EnqueueReadBuffer"); - const uint32_t command_queue_size = this->device->sysmem_manager->cq_interface.command_queue_size; - uint32_t available_space_bytes = command_queue_size - (get_cq_completion_wr_ptr(this->device->id()) << 4); - if (available_space_bytes < padded_page_size) { + const uint32_t command_completion_limit = this->manager.get_completion_queue_limit(this->command_queue_channel); + uint32_t num_pages_available = (command_completion_limit - get_cq_completion_wr_ptr<false>(this->device->id(), this->command_queue_channel, this->command_queue_channel_size)) / padded_page_size; + uint32_t pages_to_read = std::min(total_pages_to_read, num_pages_available); + if (pages_to_read == 0) { // Wrap the completion region because a single page won't fit in available space - // Wrap needs to be blocking because host needs updated write ppointer to compute how many pages can be read + // Wrap needs to be blocking because host needs updated write pointer to compute how many pages can be read this->wrap(DeviceCommand::WrapRegion::COMPLETION, true); - available_space_bytes = command_queue_size - (get_cq_completion_wr_ptr(this->device->id()) << 4); + num_pages_available = (command_completion_limit - get_cq_completion_wr_ptr<false>(this->device->id(), this->command_queue_channel, this->command_queue_channel_size)) / padded_page_size; + pages_to_read = std::min(total_pages_to_read, num_pages_available); }
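Worked example of the sizing logic above, with assumed illustrative numbers: a completion region whose byte limit for this channel is 16 MiB, a device write pointer currently at 15 MiB, and 2048-byte pages (already 32 B aligned, so align(2048, 32) == 2048):

    uint32_t command_completion_limit = 16 * 1024 * 1024;  // assumed completion limit, bytes
    uint32_t completion_wr_ptr        = 15 * 1024 * 1024;  // assumed device write pointer, bytes
    uint32_t padded_page_size         = 2048;

    uint32_t num_pages_available = (command_completion_limit - completion_wr_ptr) / padded_page_size;  // 512

512 pages can be drained on this pass; a larger buffer simply takes more trips around the loop, and once num_pages_available reaches 0 the completion region is wrapped first.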
- uint32_t pages_to_read; - if (available_space_bytes >= (total_pages_to_read * padded_page_size)) { - pages_to_read = total_pages_to_read; - } else { - pages_to_read = available_space_bytes / padded_page_size; - } - - uint32_t src_page_index = src_offset / padded_page_size; - - EnqueueReadBufferCommand command(this->device, buffer, dst, *this->device->sysmem_manager, src_page_index, pages_to_read); - - // TODO(agrebenisan): Provide support so that we can achieve non-blocking - // For now, make read buffer blocking since after the - // device moves data into the buffer we want to read out - // of, we then need to consume it into a vector. This - // is easiest way to bring this up + tt::log_debug(tt::LogDispatch, "EnqueueReadBuffer for channel {}", this->command_queue_channel); + EnqueueReadBufferCommand command(this->command_queue_channel, this->device, buffer, dst, this->manager, src_page_index, pages_to_read); this->enqueue_command(command, blocking); - - this->device->sysmem_manager->completion_queue_wait_front(); // wait for device to write data + this->manager.completion_queue_wait_front(this->command_queue_channel); // wait for device to write data uint32_t bytes_read = pages_to_read * padded_page_size; - uint32_t unpadded_bytes_read = pages_to_read * buffer.page_size(); - if ((buffer.page_size() % 32) != 0) { // If page size is not 32B-aligned, we cannot do a contiguous copy uint32_t dst_address_offset = unpadded_dst_offset; @@ -867,14 +895,14 @@ void CommandQueue::enqueue_read_buffer(Buffer& buffer, void* dst, bool blocking) dst_address_offset += buffer.page_size(); } } else { tt::Cluster::instance().read_sysmem((char*)dst + unpadded_dst_offset, bytes_read, command.read_buffer_addr, mmio_device_id, channel); } - this->device->sysmem_manager->completion_queue_pop_front(bytes_read); // signal to device that data has been read - + this->manager.completion_queue_pop_front(bytes_read, this->command_queue_channel); total_pages_to_read -= pages_to_read; - src_offset += bytes_read; - unpadded_dst_offset += unpadded_bytes_read; + src_page_index += pages_to_read; + unpadded_dst_offset += pages_to_read * buffer.page_size(); } if (buffer.buffer_layout() == TensorMemoryLayout::WIDTH_SHARDED or @@ -887,6 +915,9 @@ void CommandQueue::enqueue_read_buffer(Buffer& buffer, void* dst, bool blocking) }
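The page-by-page branch above exists because the device writes every page into the completion region at a 32 B aligned stride; only a page size that is already a multiple of 32 allows one contiguous read_sysmem. A standalone sketch of the layout (pages_read, completion_base and dst are hypothetical stand-ins for the values used in the loop above):

    uint32_t page_size = 2000;                  // assumed page size, not 32B-aligned
    uint32_t padded    = align(page_size, 32);  // 2016: 16 bytes of padding per page
    for (uint32_t p = 0; p < pages_read; p++) {
        // Source pages sit at padded strides; destination pages are packed tight.
        memcpy((char*)dst + p * page_size, completion_base + p * padded, page_size);
    }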
void CommandQueue::enqueue_write_buffer(Buffer& buffer, const void* src, bool blocking) { ZoneScopedN("CommandQueue_write_buffer"); TT_FATAL(not blocking, "EnqueueWriteBuffer only has support for non-blocking mode currently"); @@ -904,47 +935,29 @@ void CommandQueue::enqueue_write_buffer(Buffer& buffer, const void* src, bool bl uint32_t padded_page_size = align(buffer.page_size(), 32); uint32_t total_pages_to_write = buffer.num_pages(); - uint32_t dst_offset = 0; - const uint32_t command_issue_region_size = this->device->sysmem_manager->cq_interface.command_issue_region_size; - - auto get_num_pages_to_write = [&total_pages_to_write, &padded_page_size] (uint32_t available_space_bytes) { - uint32_t available_data_space_bytes = available_space_bytes - DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND; - uint32_t pages_to_write; - if (available_data_space_bytes >= (total_pages_to_write * padded_page_size)) { - pages_to_write = total_pages_to_write; - } else { - pages_to_write = available_data_space_bytes / padded_page_size; - } - return pages_to_write; - }; - + const uint32_t command_issue_limit = this->manager.get_issue_queue_limit(this->command_queue_channel); + uint32_t dst_page_index = 0; while (total_pages_to_write > 0) { - uint32_t available_space_bytes = command_issue_region_size - this->device->sysmem_manager->get_issue_queue_write_ptr(); - if (available_space_bytes <= DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND) { - // No space for the command - this->wrap(); - available_space_bytes = command_issue_region_size - this->device->sysmem_manager->get_issue_queue_write_ptr(); // recompute after wrapping - } - tt::log_debug(tt::LogDispatch, "EnqueueWriteBuffer"); + int32_t num_pages_available = (int32_t(command_issue_limit - this->manager.get_issue_queue_write_ptr(this->command_queue_channel)) - int32_t(DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND)) / int32_t(padded_page_size); + // If not even a single device command fits, we hit this edge case + num_pages_available = std::max(num_pages_available, 0); - uint32_t pages_to_write = get_num_pages_to_write(available_space_bytes); - uint32_t write_buffer_command_size = DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND + (pages_to_write * buffer.page_size()); - if (this->device->sysmem_manager->get_issue_queue_write_ptr() + write_buffer_command_size >= command_issue_region_size) { + uint32_t pages_to_write = std::min(total_pages_to_write, (uint32_t)num_pages_available); + if (pages_to_write == 0) { // No space for command and data this->wrap(); - // Recompute after wrapping - available_space_bytes = command_issue_region_size - this->device->sysmem_manager->get_issue_queue_write_ptr(); - pages_to_write = get_num_pages_to_write(available_space_bytes); + num_pages_available = (int32_t(command_issue_limit - this->manager.get_issue_queue_write_ptr(this->command_queue_channel)) - int32_t(DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND)) / int32_t(padded_page_size); + pages_to_write = std::min(total_pages_to_write, (uint32_t)num_pages_available); } - uint32_t dst_page_index = dst_offset / padded_page_size; - - EnqueueWriteBufferCommand command(this->device, buffer, src, *this->device->sysmem_manager, dst_page_index, pages_to_write); + tt::log_debug(tt::LogDispatch, "EnqueueWriteBuffer for channel {}", this->command_queue_channel); + EnqueueWriteBufferCommand command(this->command_queue_channel, this->device, buffer, src, this->manager, dst_page_index, pages_to_write); this->enqueue_command(command, blocking); total_pages_to_write -= pages_to_write; - dst_offset += (pages_to_write * padded_page_size); + dst_page_index += pages_to_write; } }
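The int32_t casts in the loop above are load-bearing: near the end of the issue region, (command_issue_limit - write_ptr) can be smaller than one DeviceCommand, and computing the difference in unsigned arithmetic would underflow into an enormous page count instead of zero. A worked example with assumed numbers:

    int32_t command_issue_limit = 12 * 1024 * 1024;          // assumed issue limit, bytes
    int32_t wr_ptr              = command_issue_limit - 64;  // 64 bytes left before the limit
    int32_t cmd_bytes           = 4096;                      // assumed command descriptor size
    int32_t padded_page_size    = 2048;

    int32_t num_pages_available = ((command_issue_limit - wr_ptr) - cmd_bytes) / padded_page_size;  // (64 - 4096) / 2048 == -1
    num_pages_available = std::max(num_pages_available, 0);  // clamp; pages_to_write == 0 then forces a wrap()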
@@ -960,9 +973,10 @@ void CommandQueue::enqueue_program(Program& program, bool blocking) { // data is cached, then we don't need to stall, otherwise we need to wait for the // data to land in DRAM first bool stall = false; + // No shared program cache so far; it can come at a later time if (not this->program_to_buffer.count(program_id)) { stall = true; - ProgramMap program_to_device_map = ConstructProgramMap(this->device, program); + ProgramMap program_to_device_map = ConstructProgramMap(this->device, program, this->dispatch_core); vector<uint32_t>& program_pages = program_to_device_map.program_pages; uint32_t program_data_size_in_bytes = program_pages.size() * sizeof(uint32_t); @@ -976,26 +990,38 @@ void CommandQueue::enqueue_program(Program& program, bool blocking) { this->enqueue_write_buffer(*this->program_to_buffer.at(program_id), program_pages.data(), blocking); this->program_to_dev_map.emplace(program_id, std::move(program_to_device_map)); + + const char *READ_BACK_PROGRAMS = std::getenv("TT_METAL_READ_BACK_PROGRAMS"); + if (READ_BACK_PROGRAMS != nullptr) { + tt::log_debug(tt::LogDispatch, "Reading back binary"); + vector<uint32_t> read_back; + read_back.resize(program_to_dev_map[program_id].program_pages.size()); + this->enqueue_read_buffer(*this->program_to_buffer.at(program_id), read_back.data(), true); + TT_ASSERT(read_back == program_to_dev_map[program_id].program_pages, "Binary sent to device differs from that read back"); + tt::log_debug(tt::LogDispatch, "Binary matched"); + } } - tt::log_debug(tt::LogDispatch, "EnqueueProgram"); + tt::log_debug(tt::LogDispatch, "EnqueueProgram for channel {}", this->command_queue_channel); uint32_t host_data_num_pages = this->program_to_dev_map.at(program_id).runtime_arg_page_transfers.size() + this->program_to_dev_map.at(program_id).cb_config_page_transfers.size(); uint32_t host_data_and_device_command_size = DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND + (host_data_num_pages * DeviceCommand::PROGRAM_PAGE_SIZE); - if ((this->device->sysmem_manager->get_issue_queue_write_ptr()) + host_data_and_device_command_size >= - this->device->sysmem_manager->cq_interface.command_issue_region_size) { + if ((this->manager.get_issue_queue_write_ptr(this->command_queue_channel)) + host_data_and_device_command_size >= + this->manager.get_issue_queue_size(this->command_queue_channel)) { TT_ASSERT( - host_data_and_device_command_size <= this->device->sysmem_manager->cq_interface.command_issue_region_size - CQ_START, "EnqueueProgram command size too large"); + host_data_and_device_command_size <= this->manager.get_issue_queue_size(this->command_queue_channel) - CQ_START, "EnqueueProgram command size too large"); this->wrap(); } - EnqueueProgramCommand command(this->device, + EnqueueProgramCommand command( + this->command_queue_channel, + this->device, *this->program_to_buffer.at(program_id), this->program_to_dev_map.at(program_id), - *this->device->sysmem_manager, + this->manager, program, stall); @@ -1004,22 +1030,23 @@ void CommandQueue::enqueue_program(Program& program, bool blocking) { void CommandQueue::finish() { ZoneScopedN("CommandQueue_finish"); - if ((this->device->sysmem_manager->get_issue_queue_write_ptr()) + DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND >= - this->device->sysmem_manager->cq_interface.command_issue_region_size) { + if ((this->manager.get_issue_queue_write_ptr(this->command_queue_channel)) + DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND >= + this->manager.get_issue_queue_limit(this->command_queue_channel)) { this->wrap(); } - tt::log_debug(tt::LogDispatch, "Finish"); + tt::log_debug(tt::LogDispatch, "Finish for channel {}", this->command_queue_channel); - FinishCommand command(this->device, *this->device->sysmem_manager); + FinishCommand command(this->command_queue_channel, this->device, this->manager); this->enqueue_command(command, false); chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device->id()); uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id());
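The polling below gives each hardware CQ a private finish flag: the per-channel slices of the hugepage all repeat the same header layout, so the flag lives at the same HOST_CQ_FINISH_PTR offset inside each slice. The device-side consumer writes a 1 there and the host clears it after observing it. A small sketch of the address computation (a restatement of finish_addr_offset below, not an additional API):

    uint32_t finish_flag_addr(uint32_t command_queue_channel, uint32_t command_queue_channel_size) {
        // Channel 0 polls HOST_CQ_FINISH_PTR itself; channel 1 polls the mirror
        // one channel-size further into the hugepage, and so on.
        return HOST_CQ_FINISH_PTR + command_queue_channel * command_queue_channel_size;
    }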
// We then poll to check that we're done. + uint32_t finish_addr_offset = this->command_queue_channel * this->command_queue_channel_size; uint32_t finish; do { - tt::Cluster::instance().read_sysmem(&finish, 4, HOST_CQ_FINISH_PTR, mmio_device_id, channel); + tt::Cluster::instance().read_sysmem(&finish, 4, HOST_CQ_FINISH_PTR + finish_addr_offset, mmio_device_id, channel); // There's also a case where the device can be hung due to an unanswered DPRINT WAIT and // a full print buffer. Poll the print server for this case and throw if it happens. @@ -1027,20 +1054,26 @@ void CommandQueue::finish() { TT_THROW("Command Queue could not finish: device hang due to unanswered DPRINT WAIT."); } } while (finish != 1); - // Reset this value to 0 before moving on finish = 0; - tt::Cluster::instance().write_sysmem(&finish, 4, HOST_CQ_FINISH_PTR, mmio_device_id, channel); + tt::Cluster::instance().write_sysmem(&finish, 4, HOST_CQ_FINISH_PTR + finish_addr_offset, mmio_device_id, channel); } void CommandQueue::wrap(DeviceCommand::WrapRegion wrap_region, bool blocking) { ZoneScopedN("CommandQueue_wrap"); - tt::log_debug(tt::LogDispatch, "EnqueueWrap"); - EnqueueWrapCommand command(this->device, *this->device->sysmem_manager, wrap_region); + tt::log_debug(tt::LogDispatch, "EnqueueWrap for channel {}", this->command_queue_channel); + EnqueueWrapCommand command(this->command_queue_channel, this->device, this->manager, wrap_region); this->enqueue_command(command, blocking); } -// OpenCL-like APIs void EnqueueReadBuffer(CommandQueue& cq, Buffer& buffer, vector<uint32_t>& dst, bool blocking) { // TODO(agrebenisan): Move to deprecated detail::DispatchStateCheck(true); @@ -1070,6 +1103,7 @@ void EnqueueWriteBuffer(CommandQueue& cq, Buffer& buffer, const void* src, bool void EnqueueProgram(CommandQueue& cq, Program& program, bool blocking) { ZoneScoped; + TT_ASSERT(cq.command_queue_channel == 0, "EnqueueProgram only supported on first command queue on device for the time being."); detail::DispatchStateCheck(true); detail::CompileProgram(cq.device, program);
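For orientation while reading the header diff that follows: every enqueue operation is a Command with the same shape, and this change threads the channel into each concrete subclass. A condensed paraphrase of that interface (a hypothetical reconstruction for illustration, not a verbatim excerpt of the real header):

    class Command {
       public:
        // Build the packet that will be copied into the issue queue.
        virtual const DeviceCommand assemble_device_command(uint32_t src_address) = 0;
        // Reserve space on this command's channel, write the packet, publish the pointer.
        virtual void process() = 0;
        virtual EnqueueCommandType type() = 0;
    };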
diff --git a/tt_metal/impl/dispatch/command_queue.hpp b/tt_metal/impl/dispatch/command_queue.hpp index 266a612c36e..0968906bc1a 100644 --- a/tt_metal/impl/dispatch/command_queue.hpp +++ b/tt_metal/impl/dispatch/command_queue.hpp @@ -78,11 +78,13 @@ class EnqueueReadBufferCommand : public Command { uint32_t src_page_index; uint32_t pages_to_read; static constexpr EnqueueCommandType type_ = EnqueueCommandType::ENQUEUE_READ_BUFFER; + uint32_t command_queue_channel; public: Buffer& buffer; uint32_t read_buffer_addr; EnqueueReadBufferCommand( + uint32_t command_queue_channel, Device* device, Buffer& buffer, void* dst, @@ -107,8 +109,10 @@ class EnqueueWriteBufferCommand : public Command { uint32_t dst_page_index; uint32_t pages_to_write; static constexpr EnqueueCommandType type_ = EnqueueCommandType::ENQUEUE_WRITE_BUFFER; + uint32_t command_queue_channel; public: EnqueueWriteBufferCommand( + uint32_t command_queue_channel, Device* device, Buffer& buffer, const void* src, @@ -125,6 +129,7 @@ class EnqueueWriteBufferCommand : public Command { class EnqueueProgramCommand : public Command { private: + uint32_t command_queue_channel; Device* device; Buffer& buffer; ProgramMap& program_to_dev_map; @@ -134,16 +139,14 @@ class EnqueueProgramCommand : public Command { static constexpr EnqueueCommandType type_ = EnqueueCommandType::ENQUEUE_PROGRAM; public: - EnqueueProgramCommand(Device*, Buffer&, ProgramMap&, SystemMemoryManager&, const Program& program, bool stall); + EnqueueProgramCommand(uint32_t command_queue_channel, Device*, Buffer&, ProgramMap&, SystemMemoryManager&, const Program& program, bool stall); - const DeviceCommand assemble_device_command(uint32_t); + const DeviceCommand assemble_device_command(uint32_t src_address); void process(); EnqueueCommandType type(); }; - // Easiest way for us to process finish is to explicitly have the device // write to address chosen by us for finish... that way we don't need // to mess with checking recv and acked class FinishCommand : public Command { @@ -151,9 +154,10 @@ class FinishCommand : public Command { Device* device; SystemMemoryManager& manager; static constexpr EnqueueCommandType type_ = EnqueueCommandType::FINISH; + uint32_t command_queue_channel; public: - FinishCommand(Device* device, SystemMemoryManager& manager); + FinishCommand(uint32_t command_queue_channel, Device* device, SystemMemoryManager& manager); const DeviceCommand assemble_device_command(uint32_t); @@ -168,9 +172,10 @@ class EnqueueWrapCommand : public Command { SystemMemoryManager& manager; DeviceCommand::WrapRegion wrap_region; static constexpr EnqueueCommandType type_ = EnqueueCommandType::WRAP; + uint32_t command_queue_channel; public: - EnqueueWrapCommand(Device* device, SystemMemoryManager& manager, DeviceCommand::WrapRegion wrap_region); + EnqueueWrapCommand(uint32_t command_queue_channel, Device* device, SystemMemoryManager& manager, DeviceCommand::WrapRegion wrap_region); const DeviceCommand assemble_device_command(uint32_t); @@ -186,11 +191,16 @@ namespace detail{ class CommandQueue { public: - CommandQueue(Device* device); + CommandQueue(Device* device, uint32_t command_queue_channel); ~CommandQueue(); private: + CoreCoord dispatch_core; + uint32_t command_queue_channel; + uint32_t command_queue_channel_size; + SystemMemoryManager& manager; + Device* device; // thread processing_thread; map> diff --git a/tt_metal/impl/dispatch/command_queue_interface.hpp b/tt_metal/impl/dispatch/command_queue_interface.hpp index 43d9a0a18e9..5082e4d2a68 100644 --- a/tt_metal/impl/dispatch/command_queue_interface.hpp +++ b/tt_metal/impl/dispatch/command_queue_interface.hpp @@ -9,22 +9,29 @@ using namespace tt::tt_metal; -inline uint32_t get_cq_issue_rd_ptr(chip_id_t chip_id) { +template <bool addr_16B> +inline uint32_t get_cq_issue_rd_ptr(chip_id_t chip_id, uint32_t cq_channel, uint32_t cq_channel_size) { uint32_t recv; chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(chip_id); uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(chip_id); - tt::Cluster::instance().read_sysmem(&recv, sizeof(uint32_t), HOST_CQ_ISSUE_READ_PTR, mmio_device_id, channel); + tt::Cluster::instance().read_sysmem(&recv, sizeof(uint32_t), HOST_CQ_ISSUE_READ_PTR + cq_channel * cq_channel_size, mmio_device_id, channel); + if (not addr_16B) { + return recv << 4; + } return recv; } -inline uint32_t get_cq_completion_wr_ptr(chip_id_t chip_id) { +template <bool addr_16B> +inline uint32_t get_cq_completion_wr_ptr(chip_id_t chip_id, uint32_t cq_channel, uint32_t cq_channel_size) {
uint32_t recv; chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(chip_id); uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(chip_id); - tt::Cluster::instance().read_sysmem(&recv, sizeof(uint32_t), HOST_CQ_COMPLETION_WRITE_PTR, mmio_device_id, channel); + tt::Cluster::instance().read_sysmem(&recv, sizeof(uint32_t), HOST_CQ_COMPLETION_WRITE_PTR + cq_channel * cq_channel_size, mmio_device_id, channel); + if (not addr_16B) { + return recv << 4; + } return recv; } - struct SystemMemoryCQInterface { // CQ is split into issue and completion regions // Host writes commands and data for H2D transfers in the issue region, device reads from the issue region @@ -32,37 +39,37 @@ struct SystemMemoryCQInterface { // Equation for issue fifo size is // | issue_fifo_wr_ptr + command size B - issue_fifo_rd_ptr | // Space available would just be issue_fifo_limit - issue_fifo_size - SystemMemoryCQInterface(uint32_t command_queue_size) : command_queue_size(command_queue_size) { - this->command_issue_region_size = tt::round_up(command_queue_size * this->default_issue_queue_split, 32); - this->command_completion_region_size = this->command_queue_size - this->command_issue_region_size; - - this->issue_fifo_size = (this->command_issue_region_size - CQ_START) >> 4; - this->issue_fifo_limit = (this->command_issue_region_size >> 4) - 1; - this->issue_fifo_wr_ptr = CQ_START >> 4; // In 16B words - this->issue_fifo_wr_toggle = - 0; // This is used for the edge case where we wrap and our read pointer has not yet moved - - this->completion_fifo_size = this->command_completion_region_size >> 4; - this->completion_fifo_limit = (this->command_queue_size >> 4) - 1; - this->completion_fifo_rd_ptr = this->command_issue_region_size >> 4; // completion region is below issue region + SystemMemoryCQInterface(uint8_t channel, uint32_t channel_size): + command_issue_region_size(tt::round_up((channel_size - CQ_START) * this->default_issue_queue_split, 32)), + command_completion_region_size((channel_size - CQ_START) - this->command_issue_region_size), + issue_fifo_size(command_issue_region_size >> 4), + issue_fifo_limit(((CQ_START + this->command_issue_region_size) + channel * channel_size) >> 4), + completion_fifo_size(command_completion_region_size >> 4), + completion_fifo_limit(issue_fifo_limit + completion_fifo_size), + offset(channel * channel_size) + { + TT_ASSERT(this->issue_fifo_limit != 0, "Cannot have a 0 fifo limit"); + this->issue_fifo_wr_ptr = (CQ_START + this->offset) >> 4; // In 16B words + this->issue_fifo_wr_toggle = 0; + + this->completion_fifo_rd_ptr = this->issue_fifo_limit; this->completion_fifo_rd_toggle = 0; } - const uint32_t command_queue_size; - // Percentage of the command queue that is dedicated for issuing commands. Issue queue size is rounded to be 32B aligned and remaining space is dedicated for completion queue // Smaller issue queues can lead to more stalls for applications that send more work to device than readback data. 
static constexpr float default_issue_queue_split = 0.75; - uint32_t command_issue_region_size; - uint32_t command_completion_region_size; + const uint32_t command_issue_region_size; + const uint32_t command_completion_region_size; - uint32_t issue_fifo_size; - uint32_t issue_fifo_limit; // Last possible FIFO address + const uint32_t issue_fifo_size; + const uint32_t issue_fifo_limit; // Last possible FIFO address + const uint32_t offset; uint32_t issue_fifo_wr_ptr; bool issue_fifo_wr_toggle; - uint32_t completion_fifo_size; - uint32_t completion_fifo_limit; // Last possible FIFO address + const uint32_t completion_fifo_size; + const uint32_t completion_fifo_limit; // Last possible FIFO address uint32_t completion_fifo_rd_ptr; bool completion_fifo_rd_toggle; }; @@ -70,140 +77,169 @@ struct SystemMemoryCQInterface { class SystemMemoryManager { private: chip_id_t device_id; - // Data required for fast writes to write pointer location - // in prefetch core's L1 - // const std::tuple tlb_data; const uint32_t m_dma_buf_size; const std::function<void (uint32_t, uint32_t, const uint8_t*, uint32_t)> fast_write_callable; - const std::set<CoreCoord> dispatch_cores; const std::function<CoreCoord (CoreCoord)> worker_from_logical_callable; - uint32_t issue_byte_addr; - uint32_t completion_byte_addr; - char* hugepage_start; + vector<uint32_t> issue_byte_addrs; + vector<uint32_t> completion_byte_addrs; + char* cq_sysmem_start; + vector<std::unique_ptr<SystemMemoryCQInterface>> cq_interfaces; + uint32_t cq_channel_size; public: - SystemMemoryCQInterface cq_interface; - SystemMemoryManager(chip_id_t device_id, const std::set<CoreCoord> &dev_dispatch_cores, const std::function<CoreCoord (CoreCoord)> &worker_from_logical) : - cq_interface( - tt::Cluster::instance().get_host_channel_size(tt::Cluster::instance().get_associated_mmio_device(device_id), tt::Cluster::instance().get_assigned_channel_for_device(device_id))), + SystemMemoryManager(chip_id_t device_id, const std::vector<std::pair<CoreCoord, CoreCoord>> &cq_cores, const std::function<CoreCoord (CoreCoord)> &worker_from_logical) : device_id(device_id), m_dma_buf_size(tt::Cluster::instance().get_m_dma_buf_size(device_id)), - hugepage_start( - (char*) tt::Cluster::instance().host_dma_address(0, tt::Cluster::instance().get_associated_mmio_device(device_id), tt::Cluster::instance().get_assigned_channel_for_device(device_id))), fast_write_callable( tt::Cluster::instance().get_fast_pcie_static_tlb_write_callable(device_id)), - dispatch_cores(dev_dispatch_cores), worker_from_logical_callable(worker_from_logical) { - auto dispatch_cores_iter = dispatch_cores.begin(); - const std::tuple<uint32_t, uint32_t> producer_tlb_data = tt::Cluster::instance().get_tlb_data(tt_cxy_pair(device_id, this->worker_from_logical_callable(*dispatch_cores_iter++))).value(); - auto [producer_tlb_offset, producer_tlb_size] = producer_tlb_data; - this->issue_byte_addr = producer_tlb_offset + CQ_ISSUE_WRITE_PTR % producer_tlb_size; + uint8_t num_hw_cqs = cq_cores.size(); + this->issue_byte_addrs.resize(num_hw_cqs); + this->completion_byte_addrs.resize(num_hw_cqs); + + uint32_t idx = 0; + for (const auto& [producer_core, consumer_core]: cq_cores) { + const std::tuple<uint32_t, uint32_t> producer_tlb_data = tt::Cluster::instance().get_tlb_data(tt_cxy_pair(device_id, this->worker_from_logical_callable(producer_core))).value(); + auto [producer_tlb_offset, producer_tlb_size] = producer_tlb_data; + this->issue_byte_addrs[idx] = producer_tlb_offset + CQ_ISSUE_WRITE_PTR % producer_tlb_size; + const std::tuple<uint32_t, uint32_t> consumer_tlb_data = tt::Cluster::instance().get_tlb_data(tt_cxy_pair(device_id, this->worker_from_logical_callable(consumer_core))).value(); + auto [consumer_tlb_offset, consumer_tlb_size] = consumer_tlb_data; + this->completion_byte_addrs[idx] = consumer_tlb_offset + CQ_COMPLETION_READ_PTR % consumer_tlb_size; + idx++; + } + + // Split the hugepage into as many pieces as there are CQs + uint32_t channel_size = tt::Cluster::instance().get_host_channel_size(device_id, tt::Cluster::instance().get_assigned_channel_for_device(device_id)) / num_hw_cqs; + char* hugepage_start = (char*) tt::Cluster::instance().host_dma_address(0, tt::Cluster::instance().get_associated_mmio_device(device_id), tt::Cluster::instance().get_assigned_channel_for_device(device_id)); + this->cq_sysmem_start = hugepage_start; + + for (uint8_t channel = 0; channel < num_hw_cqs; channel++) { + this->cq_interfaces.push_back(std::make_unique<SystemMemoryCQInterface>(channel, channel_size)); + } + this->cq_channel_size = channel_size; + }
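Worked example of the constructor's split, under the assumption of a 1 GiB host channel and TT_METAL_NUM_HW_CQS=2: each hardware CQ receives half of the hugepage, and each SystemMemoryCQInterface then carves roughly three quarters of its slice (32 B aligned, per default_issue_queue_split above) into the issue region, leaving the rest for completions:

    uint32_t hugepage_size = 1u << 30;                    // assumed 1 GiB host channel
    uint8_t  num_hw_cqs    = 2;
    uint32_t channel_size  = hugepage_size / num_hw_cqs;  // 512 MiB per CQ

    uint32_t issue_region_size      = tt::round_up((channel_size - CQ_START) * 0.75, 32);
    uint32_t completion_region_size = (channel_size - CQ_START) - issue_region_size;
    // CQ 0 occupies [0, 512 MiB) of the hugepage and CQ 1 occupies [512 MiB, 1 GiB);
    // all fifo pointers are stored in 16 B words, hence the << 4 / >> 4 conversions.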
+ + uint32_t get_issue_queue_size(const uint8_t channel) const { + return this->cq_interfaces[channel]->issue_fifo_size << 4; + } + + uint32_t get_issue_queue_limit(const uint8_t channel) const { + return this->cq_interfaces[channel]->issue_fifo_limit << 4; + } + + uint32_t get_completion_queue_size(const uint8_t channel) const { + return this->cq_interfaces[channel]->completion_fifo_size << 4; + } - const std::tuple<uint32_t, uint32_t> consumer_tlb_data = tt::Cluster::instance().get_tlb_data(tt_cxy_pair(device_id, this->worker_from_logical_callable(*dispatch_cores_iter))).value(); - auto [consumer_tlb_offset, consumer_tlb_size] = consumer_tlb_data; - this->completion_byte_addr = consumer_tlb_offset + CQ_COMPLETION_READ_PTR % consumer_tlb_size; + uint32_t get_completion_queue_limit(const uint8_t channel) const { + return this->cq_interfaces[channel]->completion_fifo_limit << 4; } - uint32_t get_issue_queue_write_ptr() const { - return this->cq_interface.issue_fifo_wr_ptr << 4; + uint32_t get_issue_queue_write_ptr(const uint8_t channel) const { + return this->cq_interfaces[channel]->issue_fifo_wr_ptr << 4; } - uint32_t get_completion_queue_read_ptr() const { - return this->cq_interface.completion_fifo_rd_ptr << 4; + uint32_t get_completion_queue_read_ptr(const uint8_t channel) const { + return this->cq_interfaces[channel]->completion_fifo_rd_ptr << 4; } - void issue_queue_reserve_back(uint32_t cmd_size_B) const { - uint32_t cmd_size_16B = - (((cmd_size_B - 1) | 31) + 1) >> 4; // Terse way to find next multiple of 32 in 16B words + void issue_queue_reserve_back(uint32_t cmd_size_B, const uint8_t channel) const { + uint32_t cmd_size_16B = align(cmd_size_B, 32) >> 4; uint32_t rd_ptr_and_toggle; uint32_t rd_ptr; uint32_t rd_toggle; + const SystemMemoryCQInterface& cq_interface = *this->cq_interfaces[channel]; do { - rd_ptr_and_toggle = get_cq_issue_rd_ptr<true>(this->device_id); + rd_ptr_and_toggle = get_cq_issue_rd_ptr<true>(this->device_id, channel, this->cq_channel_size); rd_ptr = rd_ptr_and_toggle & 0x7fffffff; rd_toggle = rd_ptr_and_toggle >> 31; } while ( - this->cq_interface - .issue_fifo_wr_ptr < rd_ptr and this->cq_interface.issue_fifo_wr_ptr + cmd_size_16B > rd_ptr or + cq_interface + .issue_fifo_wr_ptr < rd_ptr and cq_interface.issue_fifo_wr_ptr + cmd_size_16B > rd_ptr or // This is the special case where we wrapped our wr ptr and our rd ptr // has not yet moved - (rd_toggle != this->cq_interface.issue_fifo_wr_toggle and this->cq_interface.issue_fifo_wr_ptr == rd_ptr)); + (rd_toggle != cq_interface.issue_fifo_wr_toggle and cq_interface.issue_fifo_wr_ptr == rd_ptr)); } - // Ideally, data should be an array or pointer, but vector for time-being - // TODO ALMEET: MEASURE THIS void cq_write(const void* data, uint32_t size_in_bytes, uint32_t write_ptr) const { - // There is a 50% overhead if hugepage_start is
not made static. - // Eventually when we want to have multiple hugepages, we may need to template - // the sysmem writer to get this optimization. - /*static*/ char* hugepage_start = this->hugepage_start; - void* user_scratchspace = hugepage_start + write_ptr; + void* user_scratchspace = this->cq_sysmem_start + write_ptr; + memcpy(user_scratchspace, data, size_in_bytes); } - void send_issue_queue_write_ptr() const { - static CoreCoord dispatch_core = - this->worker_from_logical_callable(*this->dispatch_cores.begin()); - + void send_issue_queue_write_ptr(const uint8_t channel) const { + const SystemMemoryCQInterface& cq_interface = *this->cq_interfaces[channel]; uint32_t write_ptr_and_toggle = - this->cq_interface.issue_fifo_wr_ptr | (this->cq_interface.issue_fifo_wr_toggle << 31); - this->fast_write_callable(this->issue_byte_addr, 4, (uint8_t*)&write_ptr_and_toggle, this->m_dma_buf_size); + cq_interface.issue_fifo_wr_ptr | (cq_interface.issue_fifo_wr_toggle << 31); + // std::cout << "Sending " << (cq_interface.issue_fifo_wr_ptr << 4) << " to device" << std::endl; + this->fast_write_callable(this->issue_byte_addrs[channel], 4, (uint8_t*)&write_ptr_and_toggle, this->m_dma_buf_size); tt_driver_atomics::sfence(); } - void issue_queue_push_back(uint32_t push_size_B, bool lazy) { + void issue_queue_push_back(uint32_t push_size_B, bool lazy, const uint8_t channel) { // All data needs to be 32B aligned - uint32_t push_size_16B = - (((push_size_B - 1) | 31) + 1) >> 4; // Terse way to find next multiple of 32 in 16B words - this->cq_interface.issue_fifo_wr_ptr += push_size_16B; + uint32_t push_size_16B = align(push_size_B, 32) >> 4; - if (this->cq_interface.issue_fifo_wr_ptr > this->cq_interface.issue_fifo_limit) { - this->cq_interface.issue_fifo_wr_ptr = CQ_START >> 4; + SystemMemoryCQInterface& cq_interface = *this->cq_interfaces[channel]; + + cq_interface.issue_fifo_wr_ptr += push_size_16B; + + if (cq_interface.issue_fifo_wr_ptr >= cq_interface.issue_fifo_limit) { + cq_interface.issue_fifo_wr_ptr -= cq_interface.issue_fifo_size; // Flip the toggle - this->cq_interface.issue_fifo_wr_toggle = not this->cq_interface.issue_fifo_wr_toggle; + cq_interface.issue_fifo_wr_toggle = not cq_interface.issue_fifo_wr_toggle; } // Notify dispatch core if (not lazy) { - this->send_issue_queue_write_ptr(); + this->send_issue_queue_write_ptr(channel); } } - void completion_queue_wait_front() { + void completion_queue_wait_front(const uint8_t channel) { uint32_t write_ptr_and_toggle; uint32_t write_ptr; uint32_t write_toggle; + const SystemMemoryCQInterface& cq_interface = *this->cq_interfaces[channel]; do { - write_ptr_and_toggle = get_cq_completion_wr_ptr(this->device_id); + write_ptr_and_toggle = get_cq_completion_wr_ptr(this->device_id, channel, this->cq_channel_size); write_ptr = write_ptr_and_toggle & 0x7fffffff; write_toggle = write_ptr_and_toggle >> 31; - } while (this->cq_interface.completion_fifo_rd_ptr == write_ptr and this->cq_interface.completion_fifo_rd_toggle == write_toggle); + } while (cq_interface.completion_fifo_rd_ptr == write_ptr and cq_interface.completion_fifo_rd_toggle == write_toggle); } - void send_completion_queue_read_ptr() const { - static CoreCoord dispatch_core = - this->worker_from_logical_callable(*(++this->dispatch_cores.begin())); + void send_completion_queue_read_ptr(const uint8_t channel) const { + const SystemMemoryCQInterface& cq_interface = *this->cq_interfaces[channel]; uint32_t read_ptr_and_toggle = - this->cq_interface.completion_fifo_rd_ptr | 
(this->cq_interface.completion_fifo_rd_toggle << 31); - this->fast_write_callable(this->completion_byte_addr, 4, (uint8_t*)&read_ptr_and_toggle, this->m_dma_buf_size); + cq_interface.completion_fifo_rd_ptr | (cq_interface.completion_fifo_rd_toggle << 31); + this->fast_write_callable(this->completion_byte_addrs[channel], 4, (uint8_t*)&read_ptr_and_toggle, this->m_dma_buf_size); tt_driver_atomics::sfence(); } - void wrap_completion_queue_locally() { - cq_interface.completion_fifo_rd_ptr = cq_interface.command_issue_region_size >> 4; + void wrap_issue_queue_wr_ptr(const uint8_t channel) { + SystemMemoryCQInterface& cq_interface = *this->cq_interfaces[channel]; + cq_interface.issue_fifo_wr_ptr = (CQ_START + cq_interface.offset) >> 4; + cq_interface.issue_fifo_wr_toggle = not cq_interface.issue_fifo_wr_toggle; + this->send_issue_queue_write_ptr(channel); + } + + void wrap_completion_queue_rd_ptr(const uint8_t channel) { + SystemMemoryCQInterface& cq_interface = *this->cq_interfaces[channel]; + cq_interface.completion_fifo_rd_ptr = cq_interface.issue_fifo_limit; cq_interface.completion_fifo_rd_toggle = not cq_interface.completion_fifo_rd_toggle; } - void completion_queue_pop_front(uint32_t data_read_B) { - uint32_t data_read_16B = - (((data_read_B - 1) | 31) + 1) >> 4; // Terse way to find next multiple of 32 in 16B words + void completion_queue_pop_front(uint32_t data_read_B, const uint8_t channel) { + uint32_t data_read_16B = align(data_read_B, 32) >> 4; + + SystemMemoryCQInterface& cq_interface = *this->cq_interfaces[channel]; cq_interface.completion_fifo_rd_ptr += data_read_16B; if (cq_interface.completion_fifo_rd_ptr >= cq_interface.completion_fifo_limit) { cq_interface.completion_fifo_rd_ptr = cq_interface.command_issue_region_size >> 4; @@ -211,6 +247,6 @@ class SystemMemoryManager { } // Notify dispatch core - this->send_completion_queue_read_ptr(); + this->send_completion_queue_read_ptr(channel); } }; diff --git a/tt_metal/impl/dispatch/kernels/command_queue_consumer.cpp b/tt_metal/impl/dispatch/kernels/command_queue_consumer.cpp index efaa03288ef..72d09a393f1 100644 --- a/tt_metal/impl/dispatch/kernels/command_queue_consumer.cpp +++ b/tt_metal/impl/dispatch/kernels/command_queue_consumer.cpp @@ -3,24 +3,26 @@ // SPDX-License-Identifier: Apache-2.0 #include "tt_metal/impl/dispatch/kernels/command_queue_consumer.hpp" +// #include "debug/dprint.h" // The read interface for the issue region is set up on the device, the write interface belongs to host // Opposite for completion region where device sets up the write interface and host owns read interface -void setup_completion_queue_write_interface(const uint32_t command_issue_region_size, const uint32_t command_completion_region_size) { - uint completion_fifo_addr = command_issue_region_size >> 4; - uint completion_fifo_size = command_completion_region_size >> 4; - - cq_write_interface.completion_fifo_limit = completion_fifo_addr + completion_fifo_size; - cq_write_interface.completion_fifo_wr_ptr = completion_fifo_addr; - cq_write_interface.completion_fifo_size = completion_fifo_size; +void setup_completion_queue_write_interface(const uint32_t completion_region_wr_ptr, const uint32_t completion_region_size) { + cq_write_interface.completion_fifo_wr_ptr = completion_region_wr_ptr >> 4; + cq_write_interface.completion_fifo_size = completion_region_size >> 4; + cq_write_interface.completion_fifo_limit = (completion_region_wr_ptr + completion_region_size) >> 4; cq_write_interface.completion_fifo_wr_toggle = 0; + // DPRINT << "COMPLETION FIFO 
LIMIT: " << (cq_write_interface.completion_fifo_limit << 4) << ENDL(); } void kernel_main() { - constexpr uint32_t tensix_soft_reset_addr = get_compile_time_arg_val(0); - constexpr uint32_t command_issue_region_size = get_compile_time_arg_val(1); - constexpr uint32_t command_completion_region_size = get_compile_time_arg_val(2); bool db_buf_switch = false; + + constexpr uint32_t host_completion_queue_write_ptr_addr = get_compile_time_arg_val(0); + constexpr uint32_t completion_queue_start_addr = get_compile_time_arg_val(1); + constexpr uint32_t completion_queue_size = get_compile_time_arg_val(2); + constexpr uint32_t host_finish_addr = get_compile_time_arg_val(3); + volatile uint32_t* db_semaphore_addr = reinterpret_cast(SEMAPHORE_BASE); static constexpr uint32_t command_start_addr = L1_UNRESERVED_BASE; // Space between L1_UNRESERVED_BASE -> data_start is for commands @@ -28,11 +30,13 @@ void kernel_main() { uint64_t producer_noc_encoding = uint64_t(NOC_XY_ENCODING(PRODUCER_NOC_X, PRODUCER_NOC_Y)) << 32; uint64_t consumer_noc_encoding = uint64_t(NOC_XY_ENCODING(my_x[0], my_y[0])) << 32; - setup_completion_queue_write_interface(command_issue_region_size, command_completion_region_size); + setup_completion_queue_write_interface(completion_queue_start_addr, completion_queue_size); while (true) { // Wait for producer to supply a command + // DPRINT << "ACQUIRING" << ENDL(); db_acquire(db_semaphore_addr, consumer_noc_encoding); + // DPRINT << "DONE ACQUIRE" << ENDL(); // For each instruction, we need to jump to the relevant part of the device command uint32_t command_start_addr = get_command_slot_addr(db_buf_switch); @@ -53,19 +57,22 @@ void kernel_main() { uint32_t wrap = command_ptr[DeviceCommand::wrap_idx]; if ((DeviceCommand::WrapRegion)wrap == DeviceCommand::WrapRegion::COMPLETION) { - cq_write_interface.completion_fifo_wr_ptr = command_issue_region_size >> 4; // Head to the beginning of the completion region + cq_write_interface.completion_fifo_wr_ptr = completion_queue_start_addr >> 4; // Head to the beginning of the completion region cq_write_interface.completion_fifo_wr_toggle = not cq_write_interface.completion_fifo_wr_toggle; + // DPRINT << "Completion WRAPPING BACK TO " << (cq_write_interface.completion_fifo_wr_ptr << 4) << ENDL(); + // for (volatile int i = 0; i < 10000000; i++); notify_host_of_completion_queue_write_pointer(); } else if (is_program) { write_and_launch_program(program_transfer_start_addr, num_pages, command_ptr, producer_noc_encoding, consumer_cb_size, consumer_cb_num_pages, producer_consumer_transfer_num_pages, db_buf_switch); - wait_for_program_completion(num_workers, tensix_soft_reset_addr); + wait_for_program_completion(num_workers); } else { command_ptr = reinterpret_cast(buffer_transfer_start_addr); - write_buffers(command_ptr, command_issue_region_size, num_buffer_transfers, sharded_buffer_num_cores, consumer_cb_size, consumer_cb_num_pages, producer_noc_encoding, producer_consumer_transfer_num_pages, db_buf_switch); + write_buffers(command_ptr, completion_queue_start_addr, num_buffer_transfers, sharded_buffer_num_cores, consumer_cb_size, consumer_cb_num_pages, producer_noc_encoding, producer_consumer_transfer_num_pages, db_buf_switch); } if (finish) { - notify_host_complete(); + // DPRINT << "GOT FINISH" << ENDL(); + notify_host_complete(); } // notify producer that it has completed a command diff --git a/tt_metal/impl/dispatch/kernels/command_queue_consumer.hpp b/tt_metal/impl/dispatch/kernels/command_queue_consumer.hpp index 8b29977662d..b9b21c45778 100644 
--- a/tt_metal/impl/dispatch/kernels/command_queue_consumer.hpp +++ b/tt_metal/impl/dispatch/kernels/command_queue_consumer.hpp @@ -4,6 +4,7 @@ #include "dataflow_api.h" #include "tt_metal/impl/dispatch/kernels/command_queue_common.hpp" +// #include "debug/dprint.h" CQWriteInterface cq_write_interface; @@ -96,8 +97,8 @@ void completion_queue_reserve_back(uint32_t data_size_B) { completion_rd_ptr = completion_rd_ptr_and_toggle & 0x7fffffff; completion_rd_toggle = completion_rd_ptr_and_toggle >> 31; } while ( - cq_write_interface.completion_fifo_wr_ptr < completion_rd_ptr and cq_write_interface.completion_fifo_wr_ptr + data_size_16B > completion_rd_ptr or - (completion_rd_toggle != cq_write_interface.completion_fifo_wr_toggle and cq_write_interface.completion_fifo_wr_ptr == completion_rd_ptr) + (cq_write_interface.completion_fifo_wr_ptr < completion_rd_ptr) and (cq_write_interface.completion_fifo_wr_ptr + data_size_16B > completion_rd_ptr) or + (completion_rd_toggle != cq_write_interface.completion_fifo_wr_toggle) and (cq_write_interface.completion_fifo_wr_ptr == completion_rd_ptr) ); DEBUG_STATUS('N', 'Q', 'R', 'B', 'D'); @@ -105,10 +106,12 @@ void completion_queue_reserve_back(uint32_t data_size_B) { FORCE_INLINE void notify_host_of_completion_queue_write_pointer() { - constexpr static uint64_t pcie_address = (uint64_t(NOC_XY_ENCODING(PCIE_NOC_X, PCIE_NOC_Y)) << 32) | HOST_CQ_COMPLETION_WRITE_PTR; // For now, we are writing to host hugepages at offset + constexpr static uint32_t host_completion_queue_write_ptr_addr = get_compile_time_arg_val(0); + constexpr static uint64_t pcie_address = (uint64_t(NOC_XY_ENCODING(PCIE_NOC_X, PCIE_NOC_Y)) << 32) | host_completion_queue_write_ptr_addr; // For now, we are writing to host hugepages at offset uint32_t completion_wr_ptr_and_toggle = cq_write_interface.completion_fifo_wr_ptr | (cq_write_interface.completion_fifo_wr_toggle << 31); volatile tt_l1_ptr uint32_t* completion_wr_ptr_addr = get_cq_completion_write_ptr(); completion_wr_ptr_addr[0] = completion_wr_ptr_and_toggle; + // DPRINT << "SENDING " << (cq_write_interface.completion_fifo_wr_ptr << 4) << ENDL(); noc_async_write(CQ_COMPLETION_WRITE_PTR, pcie_address, 4); // Consider changing this to be flush instead of barrier // Barrier for now because host reads the completion queue write pointer to determine how many pages can be read @@ -116,12 +119,14 @@ void notify_host_of_completion_queue_write_pointer() { } FORCE_INLINE -void completion_queue_push_back(const uint32_t command_issue_region_size, uint32_t push_size_B) { +void completion_queue_push_back(const uint32_t completion_queue_start_addr, uint32_t push_size_B) { uint32_t push_size_16B = align(push_size_B, 32) >> 4; cq_write_interface.completion_fifo_wr_ptr += push_size_16B; + // DPRINT << "PUSH BACK" << ENDL(); if (cq_write_interface.completion_fifo_wr_ptr >= cq_write_interface.completion_fifo_limit) { - cq_write_interface.completion_fifo_wr_ptr = command_issue_region_size >> 4; + cq_write_interface.completion_fifo_wr_ptr = completion_queue_start_addr >> 4; // Flip the toggle + // DPRINT << "WRAPPING FIFO WR PTR" << ENDL(); cq_write_interface.completion_fifo_wr_toggle = not cq_write_interface.completion_fifo_wr_toggle; } @@ -131,7 +136,7 @@ void completion_queue_push_back(const uint32_t command_issue_region_size, uint32 FORCE_INLINE void write_buffers( volatile tt_l1_ptr uint32_t* command_ptr, - const uint32_t command_issue_region_size, + const uint32_t completion_queue_start_addr, uint32_t num_destinations, uint32_t 
sharded_buffer_num_cores, uint32_t consumer_cb_size, @@ -184,7 +189,7 @@ FORCE_INLINE void write_buffers( page_id += num_to_write; } if (buffer_type == BufferType::SYSTEM_MEMORY) { - completion_queue_push_back(command_issue_region_size, num_pages * page_size); + completion_queue_push_back(completion_queue_start_addr, num_pages * page_size); } } noc_async_write_barrier(); @@ -301,8 +306,7 @@ void write_and_launch_program( } } -FORCE_INLINE void wait_for_program_completion( - uint32_t num_workers, uint32_t tensix_soft_reset_addr) { +FORCE_INLINE void wait_for_program_completion(uint32_t num_workers) { if (not num_workers) return; @@ -312,17 +316,21 @@ FORCE_INLINE void wait_for_program_completion( volatile tt_l1_ptr uint32_t* message_addr_ptr = reinterpret_cast(DISPATCH_MESSAGE_ADDR); + // DPRINT << "WAIT WORKERS" << ENDL(); while (*message_addr_ptr != num_workers) ; + // DPRINT << "DONE WAIT WORKERS" << ENDL(); + DEBUG_STATUS('Q', 'D'); } +template FORCE_INLINE void notify_host_complete() { volatile tt_l1_ptr uint32_t* finish_ptr = get_cq_finish_ptr(); finish_ptr[0] = 1; constexpr static uint64_t pcie_core_noc_encoding = uint64_t(NOC_XY_ENCODING(PCIE_NOC_X, PCIE_NOC_Y)) << 32; - uint64_t finish_noc_addr = pcie_core_noc_encoding | HOST_CQ_FINISH_PTR; + uint64_t finish_noc_addr = pcie_core_noc_encoding | host_finish_addr; noc_async_write(uint32_t(finish_ptr), finish_noc_addr, 4); noc_async_write_barrier(); finish_ptr[0] = 0; diff --git a/tt_metal/impl/dispatch/kernels/command_queue_producer.cpp b/tt_metal/impl/dispatch/kernels/command_queue_producer.cpp index 2c9966876d1..6f26095332a 100644 --- a/tt_metal/impl/dispatch/kernels/command_queue_producer.cpp +++ b/tt_metal/impl/dispatch/kernels/command_queue_producer.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "tt_metal/impl/dispatch/kernels/command_queue_producer.hpp" +// #include "debug/dprint.h" static constexpr uint32_t COMMAND_START_ADDR = L1_UNRESERVED_BASE; // Space between UNRESERVED_BASE -> data_start is for commands @@ -52,20 +53,20 @@ void program_consumer_cb(bool db_buf_switch, uint64_t consumer_noc_encoding, uin // Only the read interface is set up on the device... 
the write interface // belongs to host -void setup_issue_queue_read_interface(const uint32_t command_issue_region_size) { - uint issue_fifo_addr = CQ_START >> 4; // The fifo starts after the pointer addresses - uint issue_fifo_size = (command_issue_region_size >> 4) - issue_fifo_addr; - - cq_read_interface.issue_fifo_limit = issue_fifo_addr + issue_fifo_size; - cq_read_interface.issue_fifo_rd_ptr = issue_fifo_addr; - cq_read_interface.issue_fifo_size = issue_fifo_size; +void setup_issue_queue_read_interface(const uint32_t issue_region_rd_ptr, const uint32_t issue_region_size) { + cq_read_interface.issue_fifo_rd_ptr = issue_region_rd_ptr >> 4; + cq_read_interface.issue_fifo_size = issue_region_size >> 4; + cq_read_interface.issue_fifo_limit = (issue_region_rd_ptr + issue_region_size) >> 4; cq_read_interface.issue_fifo_rd_toggle = 0; + // DPRINT << "ISSUE FIFO LIMIT: " << (cq_read_interface.issue_fifo_limit << 4) << ENDL(); } void kernel_main() { - constexpr uint32_t command_issue_region_size = get_compile_time_arg_val(0); + constexpr uint32_t host_issue_queue_read_ptr_addr = get_compile_time_arg_val(0); + constexpr uint32_t issue_queue_start_addr = get_compile_time_arg_val(1); + constexpr uint32_t issue_queue_size = get_compile_time_arg_val(2); - setup_issue_queue_read_interface(command_issue_region_size); + setup_issue_queue_read_interface(issue_queue_start_addr, issue_queue_size); // Initialize the producer/consumer DB semaphore // This represents how many buffers the producer can write to. @@ -78,15 +79,17 @@ void kernel_main() { reinterpret_cast(get_semaphore(0)); // Should be initialized to 2 by host bool db_buf_switch = false; - while (true) { + issue_queue_wait_front(); // Read in command uint32_t rd_ptr = (cq_read_interface.issue_fifo_rd_ptr << 4); uint64_t src_noc_addr = pcie_core_noc_encoding | rd_ptr; - noc_async_read(src_noc_addr, COMMAND_START_ADDR, min(DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND, command_issue_region_size - rd_ptr)); + noc_async_read(src_noc_addr, COMMAND_START_ADDR, min(DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND, issue_queue_size - rd_ptr)); + // DPRINT << "WAIT FOR COMMAND at " << rd_ptr << ENDL(); noc_async_read_barrier(); + // DPRINT << "GOT COMMAND" << ENDL(); // Producer information volatile tt_l1_ptr uint32_t* command_ptr = reinterpret_cast(COMMAND_START_ADDR); @@ -102,10 +105,18 @@ void kernel_main() { uint32_t wrap = command_ptr[DeviceCommand::wrap_idx]; uint32_t producer_consumer_transfer_num_pages = command_ptr[DeviceCommand::producer_consumer_transfer_num_pages_idx]; uint32_t sharded_buffer_num_cores = command_ptr[DeviceCommand::sharded_buffer_num_cores_idx]; + uint32_t finish = command_ptr[DeviceCommand::finish_idx]; + + // if (finish != 0 and finish != 1) { + // DPRINT << "BAD FINISH" << ENDL(); + // while(true); + // } + + // DPRINT << "Finish: " << finish << ENDL(); if ((DeviceCommand::WrapRegion)wrap == DeviceCommand::WrapRegion::ISSUE) { // Basically popfront without the extra conditional - cq_read_interface.issue_fifo_rd_ptr = CQ_START >> 4; // Head to beginning of command queue + cq_read_interface.issue_fifo_rd_ptr = cq_read_interface.issue_fifo_limit - cq_read_interface.issue_fifo_size; // Head to beginning of command queue cq_read_interface.issue_fifo_rd_toggle = not cq_read_interface.issue_fifo_rd_toggle; notify_host_of_issue_queue_read_pointer(); continue; @@ -115,7 +126,9 @@ void kernel_main() { while (db_semaphore_addr[0] == 0) ; // Check that there is space in the consumer program_consumer_cb(db_buf_switch, consumer_noc_encoding, 
consumer_cb_num_pages, page_size, consumer_cb_size); + // DPRINT << "RELAYING COMMAND" << ENDL(); relay_command(db_buf_switch, consumer_noc_encoding); + // DPRINT << "DONE RELAY" << ENDL(); if (stall) { while (*db_semaphore_addr != 2) ; @@ -128,9 +141,8 @@ void kernel_main() { noc_semaphore_inc(consumer_noc_encoding | get_semaphore(0), 1); noc_async_write_barrier(); // Barrier for now - // Fetch data and send to the consumer - + // DPRINT << "PRODUCING" << ENDL(); produce( command_ptr, num_buffer_transfers, @@ -143,8 +155,10 @@ void kernel_main() { consumer_noc_encoding, producer_consumer_transfer_num_pages, db_buf_switch); + // DPRINT << "DONE PRODUCE" << ENDL(); issue_queue_pop_front(DeviceCommand::NUM_BYTES_IN_DEVICE_COMMAND + data_size); + // DPRINT << "POP FRONT" << ENDL(); db_buf_switch = not db_buf_switch; } diff --git a/tt_metal/impl/dispatch/kernels/command_queue_producer.hpp b/tt_metal/impl/dispatch/kernels/command_queue_producer.hpp index 6dec863d2ab..59ad07a2309 100644 --- a/tt_metal/impl/dispatch/kernels/command_queue_producer.hpp +++ b/tt_metal/impl/dispatch/kernels/command_queue_producer.hpp @@ -5,6 +5,7 @@ #include "tt_metal/impl/dispatch/kernels/command_queue_common.hpp" #include "tt_metal/hostdevcommon/common_values.hpp" #include "risc_attribs.h" +// #include "debug/dprint.h" CQReadInterface cq_read_interface; @@ -13,6 +14,7 @@ inline __attribute__((always_inline)) volatile uint32_t* get_cq_issue_read_ptr() } inline __attribute__((always_inline)) volatile uint32_t* get_cq_issue_write_ptr() { + // DPRINT << "READING ISSUE WRITE POINTER FROM " << CQ_ISSUE_WRITE_PTR << ENDL(); return reinterpret_cast(CQ_ISSUE_WRITE_PTR); } @@ -22,18 +24,20 @@ void issue_queue_wait_front() { uint32_t issue_write_ptr_and_toggle; uint32_t issue_write_ptr; uint32_t issue_write_toggle; + // DPRINT << "WAIT WHERE RD PTR IS " << (cq_read_interface.issue_fifo_rd_ptr << 4) << ", rd toggle: " << cq_read_interface.issue_fifo_rd_toggle << ENDL(); do { issue_write_ptr_and_toggle = *get_cq_issue_write_ptr(); issue_write_ptr = issue_write_ptr_and_toggle & 0x7fffffff; issue_write_toggle = issue_write_ptr_and_toggle >> 31; } while (cq_read_interface.issue_fifo_rd_ptr == issue_write_ptr and cq_read_interface.issue_fifo_rd_toggle == issue_write_toggle); DEBUG_STATUS('N', 'Q', 'D'); + // DPRINT << "Successful wait on " << (cq_read_interface.issue_fifo_rd_ptr << 4) << ", rd toggle: " << cq_read_interface.issue_fifo_rd_toggle << ", wr pointer: " << (issue_write_ptr << 4) << ENDL(); } FORCE_INLINE void notify_host_of_issue_queue_read_pointer() { // These are the PCIE core coordinates - constexpr static uint64_t pcie_address = (uint64_t(NOC_XY_ENCODING(PCIE_NOC_X, PCIE_NOC_Y)) << 32) | HOST_CQ_ISSUE_READ_PTR; // For now, we are writing to host hugepages at offset + constexpr static uint64_t pcie_address = (uint64_t(NOC_XY_ENCODING(PCIE_NOC_X, PCIE_NOC_Y)) << 32) | get_compile_time_arg_val(0); // For now, we are writing to host hugepages at offset uint32_t issue_rd_ptr_and_toggle = cq_read_interface.issue_fifo_rd_ptr | (cq_read_interface.issue_fifo_rd_toggle << 31);; volatile tt_l1_ptr uint32_t* issue_rd_ptr_addr = get_cq_issue_read_ptr(); issue_rd_ptr_addr[0] = issue_rd_ptr_and_toggle; @@ -49,6 +53,11 @@ void issue_queue_pop_front(uint32_t cmd_size_B) { uint32_t cmd_size_16B = align(cmd_size_B, 32) >> 4; cq_read_interface.issue_fifo_rd_ptr += cmd_size_16B; + if (cq_read_interface.issue_fifo_rd_ptr >= cq_read_interface.issue_fifo_limit) { + cq_read_interface.issue_fifo_rd_ptr -= cq_read_interface.issue_fifo_size; + 
diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp
index adec3686e05..4344ebfe457 100644
--- a/tt_metal/impl/program/program.cpp
+++ b/tt_metal/impl/program/program.cpp
@@ -463,7 +463,7 @@ void Program::validate_circular_buffer_region(const Device *device) const {
 
     // Banks are in lockstep so we only need to get lowest L1 address of one compute and storage core
     // Only compute with storage cores can have CBs and all compute with storage cores will have the same bank offset
-    const std::vector<uint32_t> &bank_ids = device->bank_ids_from_logical_core(*device->compute_cores.begin());
+    const std::vector<uint32_t> &bank_ids = device->bank_ids_from_logical_core(*device->compute_cores_.begin());
     std::optional<uint32_t> lowest_address = allocator::lowest_occupied_l1_address(*device->allocator_, bank_ids[0]);
     uint32_t max_l1_size = device->l1_size_per_core();
 
diff --git a/tt_metal/jit_build/genfiles.cpp b/tt_metal/jit_build/genfiles.cpp
index 1d2506f2d64..4831bd8dff8 100644
--- a/tt_metal/jit_build/genfiles.cpp
+++ b/tt_metal/jit_build/genfiles.cpp
@@ -494,9 +494,7 @@ static string generate_noc_addr_ranges_string(
     const std::vector<CoreCoord>& ethernet_cores,
     CoreCoord grid_size,
     const std::vector<uint32_t>& harvested_rows,
-    const vector<CoreCoord>& dispatch_cores) {
-
-    TT_ASSERT(dispatch_cores.size() == 1, "Only 1 dispatch core supported so far");
+    const CoreCoord& enqueue_program_physical_dispatch_core) {
 
     stringstream ss;
 
@@ -573,10 +571,10 @@ static string generate_noc_addr_ranges_string(
     ss << "    (y) <= NOC_Y((uint32_t)" << 1 << ") && \\" << endl;
     ss << "    (y) >= NOC_Y((uint32_t)" << grid_size.y - 1 << "))))";
     ss << endl;
-    ss << endl;
-    ss << "#define DISPATCH_CORE_X " << dispatch_cores[0].x << endl;
-    ss << "#define DISPATCH_CORE_Y " << dispatch_cores[0].y << endl;
+
+    ss << "#define DISPATCH_CORE_X " << enqueue_program_physical_dispatch_core.x << endl;
+    ss << "#define DISPATCH_CORE_Y " << enqueue_program_physical_dispatch_core.y << endl;
 
     return ss.str();
 }
@@ -592,10 +590,10 @@ void jit_build_genfiles_noc_addr_ranges_header(
     const std::vector<CoreCoord>& ethernet_cores,
     CoreCoord grid_size,
     const std::vector<uint32_t>& harvested_rows,
-    const vector<CoreCoord>& dispatch_cores) {
+    const CoreCoord& enqueue_program_physical_dispatch_core) {
 
     string output_string = generate_noc_addr_ranges_string(pcie_addr_base, pcie_addr_size, dram_addr_base, dram_addr_size,
-        pcie_cores, dram_cores, ethernet_cores, grid_size, harvested_rows, dispatch_cores);
+        pcie_cores, dram_cores, ethernet_cores, grid_size, harvested_rows, enqueue_program_physical_dispatch_core);
 
     ofstream file_stream_br(path + "/brisc/noc_addr_ranges_gen.h");
     file_stream_br << output_string;
diff --git a/tt_metal/jit_build/genfiles.hpp b/tt_metal/jit_build/genfiles.hpp
index edb116c5be1..aee8841df30 100644
--- a/tt_metal/jit_build/genfiles.hpp
+++ b/tt_metal/jit_build/genfiles.hpp
@@ -39,7 +39,7 @@ void jit_build_genfiles_noc_addr_ranges_header(
     const std::vector<CoreCoord>& ethernet_cores,
     CoreCoord grid_size,
     const std::vector<uint32_t>& harvested_rows,
-    const std::vector<CoreCoord>& dispatch_cores);
+    const CoreCoord& enqueue_program_physical_dispatch_core);
 
 void jit_build_genfiles_descriptors(const JitBuildEnv& env, JitBuildOptions& options);
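
Replacing the vector<CoreCoord> parameter with a single CoreCoord turns the removed runtime check (the TT_ASSERT on dispatch_cores.size()) into a property of the signature: each generated noc_addr_ranges_gen.h bakes in exactly one program-dispatch core. A rough sketch of that emission step, using a stand-in CoreCoord struct and a hypothetical helper name rather than the patch's actual API:

    #include <cstddef>
    #include <fstream>
    #include <sstream>
    #include <string>

    struct CoreCoord { std::size_t x; std::size_t y; };  // stand-in for tt-metal's type

    // Emit the dispatch-core defines so device kernels see them as
    // compile-time constants in the generated header.
    std::string emit_dispatch_core_defines(const CoreCoord& dispatch_core) {
        std::stringstream ss;
        ss << "#define DISPATCH_CORE_X " << dispatch_core.x << std::endl;
        ss << "#define DISPATCH_CORE_Y " << dispatch_core.y << std::endl;
        return ss.str();
    }

    int main() {
        std::ofstream out("noc_addr_ranges_gen.h");
        out << emit_dispatch_core_defines({0, 11});  // made-up coordinates
    }
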
diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp
index 307c40a25ba..a9c2cc72bc2 100644
--- a/tt_metal/llrt/tt_cluster.cpp
+++ b/tt_metal/llrt/tt_cluster.cpp
@@ -163,7 +163,15 @@ void Cluster::get_metal_desc_from_tt_desc(
 }
 
 void Cluster::open_driver(
     chip_id_t mmio_device_id, const std::set<chip_id_t> &controlled_device_ids, const bool &skip_driver_allocs) {
-    const std::string sdesc_path = get_soc_description_file(this->arch_, this->target_type_);
+
+    // This is short-lived. To be removed after @abhullar's refactor of pulling out banking/dispatch info
+    // out of the yaml files
+    const char *NUM_HW_CQS = std::getenv("TT_METAL_NUM_HW_CQS");
+    uint8_t num_hw_cqs = 1;
+    if (NUM_HW_CQS != nullptr) {
+        num_hw_cqs = std::stoi(NUM_HW_CQS);
+    }
+    const std::string sdesc_path = get_soc_description_file(this->arch_, this->target_type_, num_hw_cqs);
 
     std::unique_ptr<tt_device> device_driver;
     if (this->target_type_ == TargetDevice::Silicon) {
diff --git a/tt_metal/soc_descriptors/grayskull_120_arch.yaml b/tt_metal/soc_descriptors/grayskull_120_arch_one_cq.yaml
similarity index 95%
rename from tt_metal/soc_descriptors/grayskull_120_arch.yaml
rename to tt_metal/soc_descriptors/grayskull_120_arch_one_cq.yaml
index c67bb964f48..104f8a520ed 100644
--- a/tt_metal/soc_descriptors/grayskull_120_arch.yaml
+++ b/tt_metal/soc_descriptors/grayskull_120_arch_one_cq.yaml
@@ -88,8 +88,11 @@ dispatch_and_banking:
     storage_cores: # Relative Only
       [[1, -1],[2, -1],[3, -1],[4, -1],[5, -1],[7, -1],[8, -1],[9, -1],[10, -1],[11, -1]]
 
-    dispatch_cores: # Relative Only
-      [[0, -1],[6, -1]]
+    producer_cores: # Relative Only
+      [[0, -1]]
+
+    consumer_cores: # Relative Only
+      [[6, -1]]
 
   E75:
     l1_bank_size:
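
The TT_METAL_NUM_HW_CQS override above feeds get_soc_description_file, whose new third parameter is not shown in this diff; presumably it selects between the _one_cq and _two_cqs descriptors. A hedged sketch of that selection, adding the input validation the bare std::stoi call skips (hypothetical helper names, assumed suffix mapping):

    #include <cstdint>
    #include <cstdlib>
    #include <stdexcept>
    #include <string>

    uint8_t num_hw_cqs_from_env() {
        const char* val = std::getenv("TT_METAL_NUM_HW_CQS");
        if (val == nullptr) return 1;  // default: one hardware command queue
        int parsed = std::stoi(val);   // throws std::invalid_argument on junk
        if (parsed < 1 || parsed > 2)
            throw std::runtime_error("TT_METAL_NUM_HW_CQS must be 1 or 2");
        return static_cast<uint8_t>(parsed);
    }

    std::string grayskull_soc_descriptor(const std::string& root, uint8_t num_hw_cqs) {
        return root + (num_hw_cqs == 2
                           ? "/tt_metal/soc_descriptors/grayskull_120_arch_two_cqs.yaml"
                           : "/tt_metal/soc_descriptors/grayskull_120_arch_one_cq.yaml");
    }
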
diff --git a/tt_metal/soc_descriptors/grayskull_120_arch_two_cqs.yaml b/tt_metal/soc_descriptors/grayskull_120_arch_two_cqs.yaml
new file mode 100644
index 00000000000..85796aaffcb
--- /dev/null
+++ b/tt_metal/soc_descriptors/grayskull_120_arch_two_cqs.yaml
@@ -0,0 +1,109 @@
+# soc-descriptor yaml
+# Anything using [#-#] is noc coordinates
+# Anything using [[#, #]] is logical coordinates (Can be relative)
+# relative index: 0 means first row, -1 means last row of functional grid...
+
+grid:
+  x_size: 13
+  y_size: 12
+
+arc:
+  [0-2]
+
+pcie:
+  [0-4]
+
+dram:
+  [[1-0], [1-6], [4-0], [4-6], [7-0], [7-6], [10-0], [10-6]]
+
+dram_preferred_eth_endpoint:
+  [ 1-0, 1-6, 4-0, 4-6, 7-0, 7-6, 10-0, 10-6 ]
+
+dram_preferred_worker_endpoint:
+  [ 1-0, 1-6, 4-0, 4-6, 7-0, 7-6, 10-0, 10-6 ]
+
+dram_address_offsets:
+  [ 0, 0, 0, 0, 0, 0, 0, 0 ]
+
+eth:
+  []
+
+functional_workers:
+  [
+   1-1, 1-2, 1-3, 1-4, 1-5, 1-7, 1-8, 1-9, 1-10, 1-11,
+   2-1, 2-2, 2-3, 2-4, 2-5, 2-7, 2-8, 2-9, 2-10, 2-11,
+   3-1, 3-2, 3-3, 3-4, 3-5, 3-7, 3-8, 3-9, 3-10, 3-11,
+   4-1, 4-2, 4-3, 4-4, 4-5, 4-7, 4-8, 4-9, 4-10, 4-11,
+   5-1, 5-2, 5-3, 5-4, 5-5, 5-7, 5-8, 5-9, 5-10, 5-11,
+   6-1, 6-2, 6-3, 6-4, 6-5, 6-7, 6-8, 6-9, 6-10, 6-11,
+   7-1, 7-2, 7-3, 7-4, 7-5, 7-7, 7-8, 7-9, 7-10, 7-11,
+   8-1, 8-2, 8-3, 8-4, 8-5, 8-7, 8-8, 8-9, 8-10, 8-11,
+   9-1, 9-2, 9-3, 9-4, 9-5, 9-7, 9-8, 9-9, 9-10, 9-11,
+   10-1, 10-2, 10-3, 10-4, 10-5, 10-7, 10-8, 10-9, 10-10, 10-11,
+   11-1, 11-2, 11-3, 11-4, 11-5, 11-7, 11-8, 11-9, 11-10, 11-11,
+   12-1, 12-2, 12-3, 12-4, 12-5, 12-7, 12-8, 12-9, 12-10, 12-11
+  ]
+
+harvested_workers:
+  []
+
+router_only:
+  [
+   0-0, 0-11, 0-1, 0-10, 0-9, 0-3, 0-8, 0-7, 0-5, 0-6,
+   12-0, 11-0, 2-0, 3-0, 9-0, 8-0, 5-0, 6-0,
+   12-6, 11-6, 2-6, 3-6, 9-6, 8-6, 5-6, 6-6
+  ]
+
+worker_l1_size:
+  1048576
+
+dram_bank_size:
+  1073741824
+
+eth_l1_size:
+  0
+
+arch_name: GRAYSKULL
+
+features:
+  unpacker:
+    version: 1
+    inline_srca_trans_without_srca_trans_instr: False
+  math:
+    dst_size_alignment: 32768
+  packer:
+    version: 1
+  overlay:
+    version: 1
+
+dispatch_and_banking:
+  E150:
+    l1_bank_size:
+      524288
+
+    compute_with_storage_grid_range: # Logical only start and end [x, y]
+      start: [0, 0]
+      end: [11, 8]
+
+    storage_cores: # Relative Only
+      [[2, -1],[3, -1],[4, -1],[5, -1],[8, -1],[9, -1],[10, -1],[11, -1]]
+
+    producer_cores: # Relative Only
+      [[0, -1], [1, -1]]
+
+    consumer_cores: # Relative Only
+      [[6, -1], [7, -1]]
+
+  E75:
+    l1_bank_size:
+      1048576
+
+    compute_with_storage_grid_range: # Logical only start and end [x, y]
+      start: [0, 0]
+      end: [10, 7]
+
+    storage_cores: # Relative Only
+      [[11, 1], [11, 2], [11, 3], [11, 5], [11, 6], [11, 7]]
+
+    dispatch_cores: # Relative Only
+      [[11, 0], [11, 4]]
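
With two hardware command queues, the E150 section above lists one producer and one consumer core per CQ, in matching positions. A minimal sketch of the per-CQ lookup a host runtime could build on top of such a descriptor (illustrative types and function, with data copied from the E150 lists; not an API from this patch):

    #include <array>
    #include <cstdint>
    #include <stdexcept>
    #include <utility>

    struct LogicalCore { int x; int y; };  // relative coordinates, as in the yaml

    // producer_cores [[0,-1],[1,-1]] and consumer_cores [[6,-1],[7,-1]]:
    // index i serves hardware command queue i.
    constexpr std::array<LogicalCore, 2> kProducerCores{{{0, -1}, {1, -1}}};
    constexpr std::array<LogicalCore, 2> kConsumerCores{{{6, -1}, {7, -1}}};

    std::pair<LogicalCore, LogicalCore> dispatch_cores_for_cq(uint8_t cq_id, uint8_t num_hw_cqs) {
        if (cq_id >= num_hw_cqs || num_hw_cqs > kProducerCores.size())
            throw std::out_of_range("cq_id exceeds configured hardware command queues");
        return {kProducerCores[cq_id], kConsumerCores[cq_id]};
    }
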