diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp index 6c8313da808..050e7563035 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp @@ -63,3 +63,35 @@ class CommandQueueMultiDeviceFixture : public ::testing::Test { tt::ARCH arch_; size_t num_devices_; }; + +// Fixture that opens every PCIe-attached device for fast-dispatch multichip tests. +// Skips (rather than fails) when slow dispatch is forced or fewer than two +// PCIe devices are present. +class CommandQueuePCIDevicesFixture : public ::testing::Test { + protected: + void SetUp() override { + // This suite requires fast dispatch; GTEST_SKIP() reports the reason + // directly instead of throwing first (a throw would make the skip + // unreachable and surface as a test error rather than a skip). + if (getenv("TT_METAL_SLOW_DISPATCH_MODE")) { + GTEST_SKIP() << "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"; + } + arch_ = tt::get_arch_from_string(tt::test_utils::get_env_arch_name()); + + num_devices_ = tt::tt_metal::GetNumPCIeDevices(); + if (num_devices_ < 2) { + GTEST_SKIP() << "This suite requires at least two PCIe-attached devices"; + } + + for (unsigned int id = 0; id < num_devices_; id++) { + devices_.push_back(tt::tt_metal::CreateDevice(id)); + } + } + + void TearDown() override { + // Close every device SetUp managed to open; devices_ is empty if we skipped. + for (auto* device : devices_) { + tt::tt_metal::CloseDevice(device); + } + devices_.clear(); + } + + std::vector<tt::tt_metal::Device*> devices_; + tt::ARCH arch_; + size_t num_devices_ = 0; +}; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_EnqueueProgram.cpp new file mode 100644 index 00000000000..8f396b4326e --- /dev/null +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_EnqueueProgram.cpp @@ -0,0 +1,788 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include +#include + +#include "command_queue_fixture.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/host_api.hpp" +#include "tt_metal/impl/kernels/kernel.hpp" +#include "tt_metal/test_utils/comparison.hpp" +#include "tt_metal/test_utils/df/df.hpp" +#include "tt_metal/test_utils/print_helpers.hpp" +#include "tt_metal/test_utils/stimulus.hpp" + +using namespace tt; +using namespace tt::test_utils; +using namespace tt::test_utils::df; + +constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet +constexpr std::int32_t MAX_NUM_WORDS = + (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE) / WORD_SIZE; +constexpr std::int32_t MAX_BUFFER_SIZE = + (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); + +struct BankedConfig { + size_t num_pages = 1; + size_t size_bytes = 1 * 2 * 32 * 32; + size_t page_size_bytes = 2 * 32 * 32; + BufferType input_buffer_type = BufferType::L1; + BufferType output_buffer_type = BufferType::L1; + tt::DataFormat l1_data_format = tt::DataFormat::Float16_b; +}; + +namespace fd_unit_tests::erisc::kernels { + +const size_t get_rand_32_byte_aligned_address(const size_t& base, const size_t& max) { + TT_ASSERT(!(base & 0x1F) and !(max & 0x1F)); + size_t word_size = (max >> 5) - (base >> 5); + return (((rand() % word_size) << 5) + base); +} + +bool test_dummy_EnqueueProgram_with_runtime_args(Device* device, const CoreCoord& eth_core_coord) { + Program program; + bool pass = true; + auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_core_coord); + + auto dummy_kernel0 = CreateKernel( + program, + "tests/tt_metal/tt_metal/gtest_unit_tests/command_queue/test_kernels/runtime_args_kernel0.cpp", + eth_core_coord, + tt_metal::experimental::EthernetConfig{.eth_mode = tt_metal::Eth::SENDER, .noc = tt_metal::NOC::NOC_0}); + + vector dummy_kernel0_args = {0, 1, 
2, 3, 4, 5, 6, 7, 8}; + + // zero out expected L1 values + std::vector all_zeros(dummy_kernel0_args.size(), 0); + llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, all_zeros, eth_l1_mem::address_map::ERISC_L1_ARG_BASE); + + tt::tt_metal::SetRuntimeArgs(program, dummy_kernel0, eth_core_coord, dummy_kernel0_args); + + tt::tt_metal::detail::CompileProgram(device, program); + auto& cq = tt::tt_metal::detail::GetCommandQueue(device); + EnqueueProgram(cq, program, false); + Finish(cq); + + vector dummy_kernel0_args_readback = llrt::read_hex_vec_from_core( + device->id(), + eth_noc_xy, + eth_l1_mem::address_map::ERISC_L1_ARG_BASE, + dummy_kernel0_args.size() * sizeof(uint32_t)); + + pass &= (dummy_kernel0_args == dummy_kernel0_args_readback); + + return pass; +} + +bool reader_kernel_no_send( + tt_metal::Device* device, + const size_t& byte_size, + const size_t& eth_l1_byte_address, + const CoreCoord& eth_reader_core) { + bool pass = true; + //////////////////////////////////////////////////////////////////////////// + // Application Setup + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program program = tt_metal::Program(); + + tt::tt_metal::InterleavedBufferConfig dram_config{ + .device = device, .size = byte_size, .page_size = byte_size, .buffer_type = tt::tt_metal::BufferType::DRAM}; + + auto input_dram_buffer = CreateBuffer(dram_config); + uint32_t dram_byte_address = input_dram_buffer.address(); + auto dram_noc_xy = input_dram_buffer.noc_coordinates(); + auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_reader_core); + log_debug( + tt::LogTest, + "Device {}: reading {} bytes from dram {} addr {} to ethernet core {} addr {}", + device->id(), + byte_size, + dram_noc_xy.str(), + dram_byte_address, + eth_reader_core.str(), + eth_l1_byte_address); + + auto eth_reader_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp", + 
eth_reader_core, + tt_metal::experimental::EthernetConfig{.eth_mode = tt_metal::Eth::SENDER, .noc = tt_metal::NOC::NOC_0}); + + //////////////////////////////////////////////////////////////////////////// + // Compile and Execute Application + //////////////////////////////////////////////////////////////////////////// + + auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); + tt_metal::detail::WriteToBuffer(input_dram_buffer, inputs); + + // Clear expected value at ethernet L1 address + std::vector all_zeros(inputs.size(), 0); + llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, all_zeros, eth_l1_byte_address); + + tt_metal::SetRuntimeArgs( + program, + eth_reader_kernel, + eth_reader_core, + { + (uint32_t)dram_byte_address, + (uint32_t)dram_noc_xy.x, + (uint32_t)dram_noc_xy.y, + (uint32_t)byte_size, + (uint32_t)eth_l1_byte_address, + }); + + auto& cq = tt::tt_metal::detail::GetCommandQueue(device); + tt::tt_metal::detail::CompileProgram(device, program); + EnqueueProgram(cq, program, false); + Finish(cq); + + auto readback_vec = llrt::read_hex_vec_from_core(device->id(), eth_noc_xy, eth_l1_byte_address, byte_size); + pass &= (readback_vec == inputs); + if (not pass) { + std::cout << "Mismatch at Core: " << eth_noc_xy.str() << std::endl; + } + return pass; +} + +bool writer_kernel_no_receive( + tt_metal::Device* device, + const size_t& byte_size, + const size_t& eth_l1_byte_address, + const CoreCoord& eth_writer_core) { + bool pass = true; + //////////////////////////////////////////////////////////////////////////// + // Application Setup + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program program = tt_metal::Program(); + + tt::tt_metal::InterleavedBufferConfig dram_config{ + .device = device, .size = byte_size, .page_size = byte_size, .buffer_type = tt::tt_metal::BufferType::DRAM}; + + auto output_dram_buffer = CreateBuffer(dram_config); + uint32_t dram_byte_address = 
output_dram_buffer.address(); + auto dram_noc_xy = output_dram_buffer.noc_coordinates(); + auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_writer_core); + log_debug( + tt::LogTest, + "Device {}: writing {} bytes from ethernet core {} addr {} to dram {} addr {}", + device->id(), + byte_size, + eth_writer_core.str(), + eth_l1_byte_address, + dram_noc_xy.str(), + dram_byte_address); + + auto eth_writer_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp", + eth_writer_core, + tt_metal::experimental::EthernetConfig{.eth_mode = tt_metal::Eth::SENDER, .noc = tt_metal::NOC::NOC_0}); + + //////////////////////////////////////////////////////////////////////////// + // Compile and Execute Application + //////////////////////////////////////////////////////////////////////////// + + auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); + llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, inputs, eth_l1_byte_address); + + // Clear expected value at ethernet L1 address + std::vector all_zeros(inputs.size(), 0); + tt_metal::detail::WriteToBuffer(output_dram_buffer, all_zeros); + + tt_metal::SetRuntimeArgs( + program, + eth_writer_kernel, + eth_writer_core, + { + (uint32_t)dram_byte_address, + (uint32_t)dram_noc_xy.x, + (uint32_t)dram_noc_xy.y, + (uint32_t)byte_size, + (uint32_t)eth_l1_byte_address, + }); + + auto& cq = tt::tt_metal::detail::GetCommandQueue(device); + tt::tt_metal::detail::CompileProgram(device, program); + EnqueueProgram(cq, program, false); + Finish(cq); + + auto readback_vec = llrt::read_hex_vec_from_core(device->id(), dram_noc_xy, dram_byte_address, byte_size); + pass &= (readback_vec == inputs); + if (not pass) { + std::cout << "Mismatch at Core: " << dram_noc_xy.str() << std::endl; + } + return pass; +} + +bool eth_direct_sender_receiver_kernels( + tt_metal::Device* sender_device, + tt_metal::Device* receiver_device, + const 
size_t& byte_size, + const size_t& src_eth_l1_byte_address, + const size_t& dst_eth_l1_byte_address, + const CoreCoord& eth_sender_core, + const CoreCoord& eth_receiver_core, + uint32_t num_bytes_per_send = 16) { + bool pass = true; + log_debug( + tt::LogTest, + "Sending {} bytes from device {} eth core {} addr {} to device {} eth core {} addr {}", + byte_size, + sender_device->id(), + eth_sender_core.str(), + src_eth_l1_byte_address, + receiver_device->id(), + eth_receiver_core.str(), + dst_eth_l1_byte_address); + // Generate inputs + auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); + llrt::write_hex_vec_to_core( + sender_device->id(), + sender_device->ethernet_core_from_logical_core(eth_sender_core), + inputs, + src_eth_l1_byte_address); + + // Clear expected value at ethernet L1 address + std::vector all_zeros(inputs.size(), 0); + llrt::write_hex_vec_to_core( + receiver_device->id(), + receiver_device->ethernet_core_from_logical_core(eth_receiver_core), + all_zeros, + dst_eth_l1_byte_address); + + //////////////////////////////////////////////////////////////////////////// + // Sender Device + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program sender_program = tt_metal::Program(); + + auto eth_sender_kernel = tt_metal::CreateKernel( + sender_program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_send.cpp", + eth_sender_core, + tt_metal::experimental::EthernetConfig{ + .eth_mode = tt_metal::Eth::SENDER, + .noc = tt_metal::NOC::NOC_0, + .compile_args = {uint32_t(num_bytes_per_send), uint32_t(num_bytes_per_send >> 4)}}); + + tt_metal::SetRuntimeArgs( + sender_program, + eth_sender_kernel, + eth_sender_core, + { + (uint32_t)src_eth_l1_byte_address, + (uint32_t)dst_eth_l1_byte_address, + (uint32_t)byte_size, + }); + + //////////////////////////////////////////////////////////////////////////// + // Receiver Device + 
//////////////////////////////////////////////////////////////////////////// + tt_metal::Program receiver_program = tt_metal::Program(); + + auto eth_receiver_kernel = tt_metal::CreateKernel( + receiver_program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_receive.cpp", + eth_receiver_core, + tt_metal::experimental::EthernetConfig{ + .eth_mode = tt_metal::Eth::RECEIVER, .noc = tt_metal::NOC::NOC_0}); // probably want to use NOC_1 here + + tt_metal::SetRuntimeArgs( + receiver_program, + eth_receiver_kernel, + eth_receiver_core, + { + (uint32_t)byte_size, + }); + + //////////////////////////////////////////////////////////////////////////// + // Compile and Execute Application + //////////////////////////////////////////////////////////////////////////// + + tt::tt_metal::detail::CompileProgram(sender_device, sender_program); + auto& sender_cq = tt::tt_metal::detail::GetCommandQueue(sender_device); + + tt::tt_metal::detail::CompileProgram(receiver_device, receiver_program); + auto& receiver_cq = tt::tt_metal::detail::GetCommandQueue(receiver_device); + + EnqueueProgram(sender_cq, sender_program, false); + EnqueueProgram(receiver_cq, receiver_program, false); + Finish(sender_cq); + Finish(receiver_cq); + + auto readback_vec = llrt::read_hex_vec_from_core( + receiver_device->id(), + receiver_device->ethernet_core_from_logical_core(eth_receiver_core), + dst_eth_l1_byte_address, + byte_size); + pass &= (readback_vec == inputs); + if (not pass) { + std::cout << "Mismatch at Core: " << eth_receiver_core.str() << std::endl; + std::cout << readback_vec[0] << std::endl; + } + return pass; +} + +bool chip_to_chip_dram_buffer_transfer( + tt_metal::Device* sender_device, + tt_metal::Device* receiver_device, + const CoreCoord& eth_sender_core, + const CoreCoord& eth_receiver_core, + const size_t& byte_size) { + bool pass = true; + + tt::tt_metal::InterleavedBufferConfig sender_dram_config{ + .device = sender_device, + .size = byte_size, + 
.page_size = byte_size, + .buffer_type = tt::tt_metal::BufferType::DRAM}; + tt::tt_metal::InterleavedBufferConfig receiver_dram_config{ + .device = receiver_device, + .size = byte_size, + .page_size = byte_size, + .buffer_type = tt::tt_metal::BufferType::DRAM}; + + // Create source buffer on sender device + auto input_dram_buffer = CreateBuffer(sender_dram_config); + uint32_t input_dram_byte_address = input_dram_buffer.address(); + auto input_dram_noc_xy = input_dram_buffer.noc_coordinates(); + + // Create dest buffer on receiver device + auto output_dram_buffer = CreateBuffer(receiver_dram_config); + uint32_t output_dram_byte_address = output_dram_buffer.address(); + auto output_dram_noc_xy = output_dram_buffer.noc_coordinates(); + + // Generate inputs + auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); + + tt_metal::detail::WriteToBuffer(input_dram_buffer, inputs); + + const uint32_t MAX_BUFFER = + (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); + uint32_t num_loops = (uint32_t)(byte_size / MAX_BUFFER); + uint32_t remaining_bytes = (uint32_t)(byte_size % MAX_BUFFER); + // Clear expected value at ethernet L1 address + std::vector all_zeros(inputs.size(), 0); + + tt_metal::detail::WriteToBuffer(output_dram_buffer, all_zeros); + + //////////////////////////////////////////////////////////////////////////// + // Sender Device + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program sender_program = tt_metal::Program(); + + auto eth_sender_kernel = tt_metal::CreateKernel( + sender_program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_sender.cpp", + eth_sender_core, + tt_metal::experimental::EthernetConfig{.eth_mode = tt_metal::Eth::SENDER, .noc = tt_metal::NOC::NOC_0}); + + tt_metal::SetRuntimeArgs( + sender_program, + eth_sender_kernel, + eth_sender_core, + { + (uint32_t)input_dram_byte_address, + 
(uint32_t)input_dram_noc_xy.x, + (uint32_t)input_dram_noc_xy.y, + (uint32_t)remaining_bytes, + (uint32_t)num_loops, + (uint32_t)MAX_BUFFER, + }); + + //////////////////////////////////////////////////////////////////////////// + // Receiver Device + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program receiver_program = tt_metal::Program(); + + auto eth_receiver_kernel = tt_metal::CreateKernel( + receiver_program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_receiver.cpp", + eth_receiver_core, + tt_metal::experimental::EthernetConfig{ + .eth_mode = tt_metal::Eth::RECEIVER, .noc = tt_metal::NOC::NOC_0}); // probably want to use NOC_1 here + + tt_metal::SetRuntimeArgs( + receiver_program, + eth_receiver_kernel, + eth_receiver_core, + { + (uint32_t)output_dram_byte_address, + (uint32_t)output_dram_noc_xy.x, + (uint32_t)output_dram_noc_xy.y, + (uint32_t)remaining_bytes, + (uint32_t)num_loops, + (uint32_t)MAX_BUFFER, + }); + + //////////////////////////////////////////////////////////////////////////// + // Compile and Execute Application + //////////////////////////////////////////////////////////////////////////// + + tt::tt_metal::detail::CompileProgram(sender_device, sender_program); + auto& sender_cq = tt::tt_metal::detail::GetCommandQueue(sender_device); + + tt::tt_metal::detail::CompileProgram(receiver_device, receiver_program); + auto& receiver_cq = tt::tt_metal::detail::GetCommandQueue(receiver_device); + + EnqueueProgram(sender_cq, sender_program, false); + EnqueueProgram(receiver_cq, receiver_program, false); + Finish(sender_cq); + Finish(receiver_cq); + + std::vector dest_dram_data; + tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_dram_data); + pass &= (dest_dram_data == inputs); + if (not pass) { + std::cout << "Mismatch at Core: " << output_dram_noc_xy.str() << std::endl; + std::cout << dest_dram_data[0] << std::endl; + } + return pass; +} + +bool 
chip_to_chip_interleaved_buffer_transfer( + tt_metal::Device* sender_device, + tt_metal::Device* receiver_device, + const CoreCoord& eth_sender_core, + const CoreCoord& eth_receiver_core, + const BankedConfig& cfg, + const uint32_t& max_transfer_size) { + bool pass = true; + + const uint32_t input0_cb_index = 0; + const uint32_t output_cb_index = 16; + + TT_FATAL(cfg.num_pages * cfg.page_size_bytes == cfg.size_bytes); + constexpr uint32_t num_pages_cb = 1; + + //////////////////////////////////////////////////////////////////////////// + // Sender Device + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program sender_program = tt_metal::Program(); + + auto input_packed = generate_uniform_random_vector(0, 100, cfg.size_bytes / sizeof(uint32_t)); + + tt::tt_metal::InterleavedBufferConfig sender_config{ + .device = sender_device, + .size = cfg.size_bytes, + .page_size = cfg.page_size_bytes, + .buffer_type = cfg.input_buffer_type}; + tt::tt_metal::InterleavedBufferConfig receiver_config{ + .device = receiver_device, + .size = cfg.size_bytes, + .page_size = cfg.page_size_bytes, + .buffer_type = cfg.output_buffer_type}; + auto input_buffer = CreateBuffer(sender_config); + bool input_is_dram = cfg.input_buffer_type == BufferType::DRAM; + + tt_metal::detail::WriteToBuffer(input_buffer, input_packed); + + const uint32_t max_buffer = round_down(max_transfer_size, cfg.page_size_bytes); + uint32_t pages_per_loop = max_buffer / cfg.page_size_bytes; + uint32_t num_loops = (uint32_t)(cfg.size_bytes / max_buffer); + uint32_t remaining_bytes = (uint32_t)(cfg.size_bytes % max_buffer); + uint32_t remaining_pages = remaining_bytes / cfg.page_size_bytes; + + auto eth_sender_kernel = tt_metal::CreateKernel( + sender_program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/interleaved_buffer_to_buffer_sender.cpp", + eth_sender_core, + tt_metal::experimental::EthernetConfig{ + .eth_mode = tt_metal::Eth::SENDER, .noc = 
tt_metal::NOC::NOC_0, .compile_args = {(uint32_t)input_is_dram}}); + + tt_metal::SetRuntimeArgs( + sender_program, + eth_sender_kernel, + eth_sender_core, + {(uint32_t)input_buffer.address(), + (uint32_t)cfg.page_size_bytes, + (uint32_t)max_buffer, + (uint32_t)num_loops, + (uint32_t)pages_per_loop, + (uint32_t)remaining_bytes, + (uint32_t)remaining_pages}); + + //////////////////////////////////////////////////////////////////////////// + // Receiver Device + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program receiver_program = tt_metal::Program(); + + auto output_buffer = CreateBuffer(receiver_config); + bool output_is_dram = cfg.output_buffer_type == BufferType::DRAM; + std::vector all_zeros(cfg.size_bytes / sizeof(uint32_t), 0); + + tt_metal::detail::WriteToBuffer(output_buffer, all_zeros); + + auto eth_receiver_kernel = tt_metal::CreateKernel( + receiver_program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/interleaved_buffer_to_buffer_receiver.cpp", + eth_receiver_core, + tt_metal::experimental::EthernetConfig{ + .eth_mode = tt_metal::Eth::RECEIVER, + .noc = tt_metal::NOC::NOC_1, + .compile_args = {(uint32_t)output_is_dram}}); + + tt_metal::SetRuntimeArgs( + receiver_program, + eth_receiver_kernel, + eth_receiver_core, + { + (uint32_t)output_buffer.address(), + (uint32_t)cfg.page_size_bytes, + (uint32_t)max_buffer, + (uint32_t)num_loops, + (uint32_t)pages_per_loop, + (uint32_t)remaining_bytes, + (uint32_t)remaining_pages, + }); + + //////////////////////////////////////////////////////////////////////////// + // Compile and Execute Application + //////////////////////////////////////////////////////////////////////////// + + tt::tt_metal::detail::CompileProgram(sender_device, sender_program); + auto& sender_cq = tt::tt_metal::detail::GetCommandQueue(sender_device); + + tt::tt_metal::detail::CompileProgram(receiver_device, receiver_program); + auto& receiver_cq = 
tt::tt_metal::detail::GetCommandQueue(receiver_device); + + EnqueueProgram(sender_cq, sender_program, false); + EnqueueProgram(receiver_cq, receiver_program, false); + Finish(sender_cq); + Finish(receiver_cq); + + std::vector dest_buffer_data; + tt_metal::detail::ReadFromBuffer(output_buffer, dest_buffer_data); + pass &= input_packed == dest_buffer_data; + return pass; +} +} // namespace fd_unit_tests::erisc::kernels + +TEST_F(CommandQueuePCIDevicesFixture, EnqueueDummyProgramOnEthCore) { + for (const auto& device : devices_) { + for (const auto& eth_core : device->get_active_ethernet_cores()) { + ASSERT_TRUE(fd_unit_tests::erisc::kernels::test_dummy_EnqueueProgram_with_runtime_args(device, eth_core)); + } + } +} + +TEST_F(CommandQueuePCIDevicesFixture, EthKernelsNocReadNoSend) { + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + + for (const auto& device : devices_) { + for (const auto& eth_core : device->get_active_ethernet_cores()) { + ASSERT_TRUE(fd_unit_tests::erisc::kernels::reader_kernel_no_send( + device, WORD_SIZE, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(fd_unit_tests::erisc::kernels::reader_kernel_no_send( + device, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(fd_unit_tests::erisc::kernels::reader_kernel_no_send( + device, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); + } + } +} + +TEST_F(CommandQueuePCIDevicesFixture, EthKernelsNocWriteNoReceive) { + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + + for (const auto& device : devices_) { + for (const auto& eth_core : device->get_active_ethernet_cores()) { + ASSERT_TRUE(fd_unit_tests::erisc::kernels::writer_kernel_no_receive( + device, WORD_SIZE, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(fd_unit_tests::erisc::kernels::writer_kernel_no_receive( + device, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); + 
ASSERT_TRUE(fd_unit_tests::erisc::kernels::writer_kernel_no_receive( + device, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); + } + } +} + +TEST_F(CommandQueuePCIDevicesFixture, EthKernelsDirectSendAllConnectedChips) { + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + for (const auto& sender_device : devices_) { + for (const auto& receiver_device : devices_) { + if (sender_device->id() == receiver_device->id()) { + continue; + } + for (const auto& sender_core : sender_device->get_active_ethernet_cores()) { + auto [device_id, receiver_core] = sender_device->get_connected_ethernet_core(sender_core); + if (receiver_device->id() != device_id) { + continue; + } + ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( + sender_device, + receiver_device, + WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( + sender_device, + receiver_device, + 4 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( + sender_device, + receiver_device, + 256 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( + sender_device, + receiver_device, + 1000 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + } + } + } +} + +TEST_F(CommandQueuePCIDevicesFixture, EthKernelsRandomDirectSendTests) { + srand(0); + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + + std::map, std::tuple> connectivity = {}; + for (const auto& sender_core : device_0->get_active_ethernet_cores()) 
{ + const auto& receiver_core = device_0->get_connected_ethernet_core(sender_core); + if (std::get<0>(receiver_core) != device_1->id()) { + continue; + } + connectivity.insert({{device_0->id(), sender_core}, receiver_core}); + } + for (const auto& sender_core : device_1->get_active_ethernet_cores()) { + const auto& receiver_core = device_1->get_connected_ethernet_core(sender_core); + if (std::get<0>(receiver_core) != device_0->id()) { + continue; + } + connectivity.insert({{device_1->id(), sender_core}, receiver_core}); + } + for (int i = 0; i < 1000; i++) { + auto it = connectivity.begin(); + std::advance(it, rand() % (connectivity.size())); + + const auto& send_chip = devices_.at(std::get<0>(it->first)); + CoreCoord sender_core = std::get<1>(it->first); + const auto& receiver_chip = devices_.at(std::get<0>(it->second)); + CoreCoord receiver_core = std::get<1>(it->second); + + const size_t src_eth_l1_byte_address = fd_unit_tests::erisc::kernels::get_rand_32_byte_aligned_address( + eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, eth_l1_mem::address_map::MAX_L1_LOADING_SIZE); + const size_t dst_eth_l1_byte_address = fd_unit_tests::erisc::kernels::get_rand_32_byte_aligned_address( + eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, eth_l1_mem::address_map::MAX_L1_LOADING_SIZE); + + int max_words = (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - + std::max(src_eth_l1_byte_address, dst_eth_l1_byte_address)) / + WORD_SIZE; + int num_words = rand() % max_words + 1; + + ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( + send_chip, + receiver_chip, + WORD_SIZE * num_words, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + } +} + +TEST_F(CommandQueuePCIDevicesFixture, EthKernelsSendDramBufferAllConnectedChips) { + for (const auto& sender_device : devices_) { + for (const auto& receiver_device : devices_) { + if (sender_device->id() == receiver_device->id()) { + continue; + } + for (const auto& 
sender_eth_core : sender_device->get_active_ethernet_cores()) { + auto [device_id, receiver_eth_core] = sender_device->get_connected_ethernet_core(sender_eth_core); + if (receiver_device->id() != device_id) { + continue; + } + log_info( + tt::LogTest, + "Sending dram buffer from device {} to device {}, using eth core {} and {}", + sender_device->id(), + receiver_device->id(), + sender_eth_core.str(), + receiver_eth_core.str()); + + ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( + sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16)); + ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( + sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1024)); + ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( + sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16 * 1024)); + ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( + sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1000 * 1024)); + } + } + } +} + +TEST_F(CommandQueuePCIDevicesFixture, EthKernelsSendInterleavedBufferAllConnectedChips) { + for (const auto& sender_device : devices_) { + for (const auto& receiver_device : devices_) { + if (sender_device->id() == receiver_device->id()) { + continue; + } + for (const auto& sender_eth_core : sender_device->get_active_ethernet_cores()) { + auto [device_id, receiver_eth_core] = sender_device->get_connected_ethernet_core(sender_eth_core); + if (receiver_device->id() != device_id) { + continue; + } + + log_info( + tt::LogTest, + "Sending interleaved buffer from device {} to device {}, using eth core {} and {}", + sender_device->id(), + receiver_device->id(), + sender_eth_core.str(), + receiver_eth_core.str()); + BankedConfig test_config = BankedConfig{ + .num_pages = 200, + .size_bytes = 200 * 2 * 32 * 32, + .page_size_bytes = 2 * 32 * 32, + .input_buffer_type = BufferType::L1, + 
.output_buffer_type = BufferType::DRAM}; + + ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + sender_device, + receiver_device, + sender_eth_core, + receiver_eth_core, + test_config, + test_config.page_size_bytes)); + ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); + test_config = BankedConfig{ + .num_pages = 200, + .size_bytes = 200 * 2 * 32 * 32, + .page_size_bytes = 2 * 32 * 32, + .input_buffer_type = BufferType::DRAM, + .output_buffer_type = BufferType::L1}; + ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + sender_device, + receiver_device, + sender_eth_core, + receiver_eth_core, + test_config, + test_config.page_size_bytes)); + ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); + } + } + } +} diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp new file mode 100644 index 00000000000..5fa5896843b --- /dev/null +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp @@ -0,0 +1,482 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include +#include + +#include "command_queue_fixture.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/host_api.hpp" +#include "tt_metal/impl/kernels/kernel.hpp" +#include "tt_metal/test_utils/comparison.hpp" +#include "tt_metal/test_utils/df/df.hpp" +#include "tt_metal/test_utils/print_helpers.hpp" +#include "tt_metal/test_utils/stimulus.hpp" + +using namespace tt; +using namespace tt::test_utils; +using namespace tt::test_utils::df; + +constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet +constexpr std::int32_t MAX_NUM_WORDS = + (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE) / WORD_SIZE; +constexpr std::int32_t MAX_BUFFER_SIZE = + (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); + +struct BankedConfig { + size_t num_pages = 1; + size_t size_bytes = 1 * 2 * 32 * 32; + size_t page_size_bytes = 2 * 32 * 32; + BufferType input_buffer_type = BufferType::L1; + BufferType output_buffer_type = BufferType::L1; + tt::DataFormat l1_data_format = tt::DataFormat::Float16_b; +}; + +std::vector get_hamiltonian_cycle(vector>& adj, int N, int s = 0) { + std::vector> dp(N, std::vector(1 << N, -1)); + + for (int i = 0; i < N; ++i) { + if (adj[s][i]) { + dp[i][(1 << i)] = i; + } + } + + for (int i = 0; i < (1 << N); ++i) { + for (int j = 0; j < N; ++j) { + if (i & (1 << j)) { + for (int k = 0; k < N; ++k) { + if (i & (1 << k) && adj[k][j] && j != k && dp[k][i ^ (1 << j)] != -1) { + dp[j][i] = k; + break; + } + } + } + } + } + + for (int i = 0; i < N; ++i) { + int m = (1 << N) - 1; + + if (dp[i][m] != -1 && i == s) { + std::vector path; + path.reserve(N + 1); + path.push_back(i); + + for (int j = 0; j < N - 1; ++j) { + path.push_back(dp[*path.rbegin()][m]); + m ^= 1 << *(path.rbegin() + 1); + } + path.push_back(s); + return path; + } + } + return {}; +} + +std::vector 
get_device_ring(std::vector devices) { + std::vector> adj(devices.size(), std::vector(devices.size(), 0)); + for (uint32_t i = 0; i < devices.size(); ++i) { + const auto& device = devices[i]; + for (const auto& connected_device_id : device->get_ethernet_connected_chip_ids()) { + for (uint32_t j = 0; j < devices.size(); ++j) { + if (devices[j]->id() == connected_device_id) { + adj[i][j] = 1; + } + } + } + } + + const auto& device_ring_idx = get_hamiltonian_cycle(adj, devices.size(), 0); + std::vector device_ring; + device_ring.reserve(device_ring_idx.size()); + for (const auto& device_idx : device_ring_idx) { + device_ring.push_back(devices[device_idx]); + } + return device_ring; +} + +std::vector> get_sender_receiver_cores( + std::vector device_ring) { + std::vector> sender_receivers; + sender_receivers.reserve(device_ring.size() - 1); + + // Special case for 2 devices to ensure core pairs are not the same for send and receive + if (device_ring.size() - 1 == 2) { + const auto& first_device = device_ring[0]; + const auto& second_device = device_ring[1]; + uint32_t i = 0; + for (const auto& first_eth_core : first_device->get_active_ethernet_cores()) { + auto [device_id, second_eth_core] = first_device->get_connected_ethernet_core(first_eth_core); + if (second_device->id() == device_id) { + Device *sender_device, *receiver_device; + CoreCoord sender_eth_core, receiver_eth_core; + if (i == 0) { + sender_device = first_device, receiver_device = second_device; + sender_eth_core = first_eth_core, receiver_eth_core = second_eth_core; + } else { + sender_device = second_device, receiver_device = first_device; + sender_eth_core = second_eth_core, receiver_eth_core = first_eth_core; + } + sender_receivers.push_back({sender_device, receiver_device, sender_eth_core, receiver_eth_core}); + log_info( + tt::LogTest, + "Sender: {} Receiver: {} Sender Eth: {} Receiver Eth: {}", + sender_device->id(), + receiver_device->id(), + sender_eth_core.str(), + receiver_eth_core.str()); + if 
(i > 0) { + break; + } + i++; + } + } + } else { + for (uint32_t i = 0; i < device_ring.size() - 1; ++i) { + const auto& sender_device = device_ring[i]; + const auto& receiver_device = device_ring[i + 1]; + for (const auto& sender_eth_core : sender_device->get_active_ethernet_cores()) { + auto [device_id, receiver_eth_core] = sender_device->get_connected_ethernet_core(sender_eth_core); + if (receiver_device->id() == device_id) { + sender_receivers.push_back({sender_device, receiver_device, sender_eth_core, receiver_eth_core}); + log_info( + tt::LogTest, + "Sender: {} Receiver: {} Sender Eth: {} Receiver Eth: {}", + sender_device->id(), + receiver_device->id(), + sender_eth_core.str(), + receiver_eth_core.str()); + break; + } + } + } + } + return sender_receivers; +} + +namespace fd_unit_tests::erisc::kernels { + +bool eth_direct_ring_gather_sender_receiver_kernels( + std::vector device_ring, + const size_t& byte_size_per_device, + const size_t& src_eth_l1_byte_address, + const size_t& dst_eth_l1_byte_address, + const size_t& sem_l1_byte_address, + uint32_t num_bytes_per_send = 16) { + bool pass = true; + const auto& sender_receivers = get_sender_receiver_cores(device_ring); + + // Generate inputs + uint32_t numel = byte_size_per_device / sizeof(uint32_t); + std::vector> inputs; + inputs.reserve(sender_receivers.size()); + std::vector all_zeros(numel * sender_receivers.size(), 0); + std::map programs; + std::vector full_input; + full_input.reserve(numel * sender_receivers.size()); + + for (uint32_t i = 0; i < sender_receivers.size(); ++i) { + inputs.emplace_back( + generate_uniform_random_vector(0, 100, byte_size_per_device / sizeof(uint32_t), i)); + full_input.insert(full_input.begin() + i * numel, inputs[i].begin(), inputs[i].end()); + + //////////////////////////////////////////////////////////////////////////// + // Sender Device + //////////////////////////////////////////////////////////////////////////// + const auto& [sender_device, receiver_device, 
eth_sender_core, eth_receiver_core] = sender_receivers[i]; + auto& sender_program = programs[sender_device->id()]; + auto& receiver_program = programs[receiver_device->id()]; + CoreCoord sender_receiver_core; + for (uint32_t j = 0; j < sender_receivers.size(); ++j) { + if (std::get<1>(sender_receivers[j])->id() == sender_device->id()) { + sender_receiver_core = sender_device->ethernet_core_from_logical_core(std::get<3>(sender_receivers[j])); + } + } + auto eth_sender_kernel = tt_metal::CreateKernel( + sender_program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_ring_gather_send.cpp", + eth_sender_core, + tt_metal::experimental::EthernetConfig{ + .eth_mode = tt_metal::Eth::SENDER, + .noc = tt_metal::NOC::NOC_0, + .compile_args = { + uint32_t(num_bytes_per_send), + uint32_t(num_bytes_per_send >> 4), + uint32_t(sender_receiver_core.x), + uint32_t(sender_receiver_core.y)}}); + + tt_metal::SetRuntimeArgs( + sender_program, + eth_sender_kernel, + eth_sender_core, + {(uint32_t)(src_eth_l1_byte_address + (sender_receivers.size() - 1) * byte_size_per_device), + (uint32_t)dst_eth_l1_byte_address, + (uint32_t)byte_size_per_device, + (uint32_t)sender_receivers.size() - 1, + (uint32_t)(src_eth_l1_byte_address + i * byte_size_per_device), + (uint32_t)i, + (uint32_t)sem_l1_byte_address}); + + llrt::write_hex_vec_to_core( + sender_device->id(), + sender_device->ethernet_core_from_logical_core(eth_sender_core), + inputs[i], + src_eth_l1_byte_address + i * byte_size_per_device); + llrt::write_hex_vec_to_core( + sender_device->id(), + sender_device->ethernet_core_from_logical_core(eth_sender_core), + {INVALID}, + sem_l1_byte_address); + + //////////////////////////////////////////////////////////////////////////// + // Receiver Device + //////////////////////////////////////////////////////////////////////////// + // Clear expected value at ethernet L1 address + CoreCoord receiver_sender_core; + for (uint32_t j = 0; j < sender_receivers.size(); ++j) 
{ + if (std::get<0>(sender_receivers[j])->id() == receiver_device->id()) { + receiver_sender_core = + receiver_device->ethernet_core_from_logical_core(std::get<2>(sender_receivers[j])); + } + } + + llrt::write_hex_vec_to_core( + receiver_device->id(), + receiver_device->ethernet_core_from_logical_core(eth_receiver_core), + all_zeros, + dst_eth_l1_byte_address); + llrt::write_hex_vec_to_core( + receiver_device->id(), + receiver_device->ethernet_core_from_logical_core(eth_receiver_core), + {INVALID}, + sem_l1_byte_address); + auto eth_receiver_kernel = tt_metal::CreateKernel( + receiver_program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_ring_gather_receive.cpp", + eth_receiver_core, + tt_metal::experimental::EthernetConfig{ + .eth_mode = tt_metal::Eth::RECEIVER, + .noc = tt_metal::NOC::NOC_1, + .compile_args = { + uint32_t(receiver_sender_core.x), + uint32_t(receiver_sender_core.y)}}); // probably want to use NOC_1 here + + tt_metal::SetRuntimeArgs( + receiver_program, + eth_receiver_kernel, + eth_receiver_core, + {(uint32_t)byte_size_per_device, (uint32_t)sender_receivers.size() - 1, (uint32_t)sem_l1_byte_address}); + } + + //////////////////////////////////////////////////////////////////////////// + // Compile and Execute Application + //////////////////////////////////////////////////////////////////////////// + + std::vector cqs; + for (uint32_t i = 0; i < sender_receivers.size(); ++i) { + const auto& device = std::get<0>(sender_receivers[i]); + tt::tt_metal::detail::CompileProgram(device, programs.at(device->id())); + auto& cq = tt::tt_metal::detail::GetCommandQueue(device); + + EnqueueProgram(cq, programs.at(device->id()), false); + cqs.emplace_back(cq); + } + for (auto& cq : cqs) { + Finish(cq); + } + + for (uint32_t i = 0; i < sender_receivers.size(); ++i) { + const auto& device = std::get<0>(sender_receivers[i]); + const auto& core = std::get<2>(sender_receivers[i]); + auto readback_vec = llrt::read_hex_vec_from_core( + 
device->id(), + device->ethernet_core_from_logical_core(core), + src_eth_l1_byte_address, + byte_size_per_device * sender_receivers.size()); + auto a = std::mismatch(full_input.begin(), full_input.end(), readback_vec.begin()); + bool p = (a.first == full_input.end()); + pass &= p; + if (not p) { + log_error(tt::LogTest, "Mismatch on Device {} at Core: {}", device->id(), core.str()); + log_error( + tt::LogTest, "Position: {} Expected: {} Read: {}", a.first - full_input.begin(), *a.first, *a.second); + } + } + + return pass; +} + +bool eth_interleaved_ring_gather_sender_receiver_kernels( + std::vector device_ring, + const BankedConfig& cfg, + const size_t& src_eth_l1_byte_address, + const size_t& dst_eth_l1_byte_address, + const size_t& sem_l1_byte_address, + uint32_t num_bytes_per_send = 16) { + bool pass = true; + const auto& sender_receivers = get_sender_receiver_cores(device_ring); + + // Generate inputs + uint32_t numel = cfg.size_bytes / sizeof(uint32_t); + std::vector> inputs; + inputs.reserve(sender_receivers.size()); + std::vector all_zeros(numel * sender_receivers.size(), 0); + std::map programs; + std::vector full_input; + full_input.reserve(numel * sender_receivers.size()); + + std::vector output_buffers; + output_buffers.reserve(sender_receivers.size()); + + for (uint32_t i = 0; i < sender_receivers.size(); ++i) { + inputs.emplace_back( + tt::test_utils::generate_packed_uniform_random_vector( + -1.0f, 1.0f, cfg.size_bytes / tt::test_utils::df::bfloat16::SIZEOF, i)); + full_input.insert(full_input.begin() + i * numel, inputs[i].begin(), inputs[i].end()); + + const auto& device = std::get<0>(sender_receivers[i]); + const auto& eth_sender_core = std::get<2>(sender_receivers[i]); + CoreCoord eth_receiver_core; + for (uint32_t j = 0; j < sender_receivers.size(); ++j) { + if (std::get<1>(sender_receivers[j])->id() == device->id()) { + eth_receiver_core = std::get<3>(sender_receivers[j]); + break; + } + } + + auto& program = programs[device->id()]; + + auto 
input_buffer = + CreateBuffer(InterleavedBufferConfig{device, cfg.size_bytes, cfg.page_size_bytes, cfg.input_buffer_type}); + bool input_is_dram = cfg.input_buffer_type == BufferType::DRAM; + tt_metal::detail::WriteToBuffer(input_buffer, inputs[i]); + output_buffers.emplace_back(CreateBuffer(InterleavedBufferConfig{ + device, cfg.size_bytes * sender_receivers.size(), cfg.page_size_bytes, cfg.output_buffer_type})); + tt_metal::detail::WriteToBuffer(output_buffers[i], all_zeros); + + auto eth_sender_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/interleaved_eth_ring_gather_send.cpp", + eth_sender_core, + tt_metal::experimental::EthernetConfig{ + .eth_mode = tt_metal::Eth::SENDER, + .noc = tt_metal::NOC::NOC_0, + .compile_args = { + uint32_t(num_bytes_per_send), + uint32_t(num_bytes_per_send >> 4), + uint32_t(device->ethernet_core_from_logical_core(eth_receiver_core).x), + uint32_t(device->ethernet_core_from_logical_core(eth_receiver_core).y), + uint32_t(input_buffer.buffer_type() == BufferType::DRAM), + uint32_t(output_buffers[i].buffer_type() == BufferType::DRAM)}}); + + tt_metal::SetRuntimeArgs( + program, + eth_sender_kernel, + eth_sender_core, + {(uint32_t)(src_eth_l1_byte_address), + (uint32_t)dst_eth_l1_byte_address, + (uint32_t)cfg.size_bytes + 32, // + 32 for idx + (uint32_t)sender_receivers.size() - 1, + (uint32_t)(i * cfg.num_pages), + (uint32_t)input_buffer.address(), + (uint32_t)output_buffers[i].address(), + (uint32_t)cfg.num_pages, + (uint32_t)cfg.page_size_bytes, + (uint32_t)sem_l1_byte_address}); + llrt::write_hex_vec_to_core( + device->id(), device->ethernet_core_from_logical_core(eth_sender_core), {INVALID}, sem_l1_byte_address); + + llrt::write_hex_vec_to_core( + device->id(), device->ethernet_core_from_logical_core(eth_receiver_core), {INVALID}, sem_l1_byte_address); + + auto eth_receiver_kernel = tt_metal::CreateKernel( + program, + 
"tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/interleaved_eth_ring_gather_receive.cpp", + eth_receiver_core, + tt_metal::experimental::EthernetConfig{ + .eth_mode = tt_metal::Eth::RECEIVER, + .noc = tt_metal::NOC::NOC_1, + .compile_args = { + uint32_t(device->ethernet_core_from_logical_core(eth_sender_core).x), + uint32_t(device->ethernet_core_from_logical_core(eth_sender_core).y), + uint32_t( + output_buffers[i].buffer_type() == BufferType::DRAM)}}); // probably want to use NOC_1 here + + tt_metal::SetRuntimeArgs( + program, + eth_receiver_kernel, + eth_receiver_core, + {(uint32_t)dst_eth_l1_byte_address, + (uint32_t)cfg.size_bytes + 32, // + 32 for idx + (uint32_t)sender_receivers.size() - 1, + (uint32_t)output_buffers[i].address(), + (uint32_t)cfg.num_pages, + (uint32_t)cfg.page_size_bytes, + (uint32_t)sem_l1_byte_address}); + } + + //////////////////////////////////////////////////////////////////////////// + // Compile and Execute Application + //////////////////////////////////////////////////////////////////////////// + + std::vector cqs; + for (uint32_t i = 0; i < sender_receivers.size(); ++i) { + const auto& device = std::get<0>(sender_receivers[i]); + tt::tt_metal::detail::CompileProgram(device, programs.at(device->id())); + auto& cq = tt::tt_metal::detail::GetCommandQueue(device); + + EnqueueProgram(cq, programs.at(device->id()), false); + cqs.emplace_back(cq); + } + for (auto& cq : cqs) { + Finish(cq); + } + + for (uint32_t i = 0; i < sender_receivers.size(); ++i) { + const auto& device = std::get<0>(sender_receivers[i]); + const auto& core = std::get<2>(sender_receivers[i]); + std::vector readback_vec; + tt_metal::detail::ReadFromBuffer(output_buffers[i], readback_vec); + auto a = std::mismatch(full_input.begin(), full_input.end(), readback_vec.begin()); + bool p = (a.first == full_input.end()); + pass &= p; + if (not p) { + log_error(tt::LogTest, "Mismatch on Device {} at Core: {}", device->id(), core.str()); + log_error( + 
tt::LogTest, "Position: {} Expected: {} Read: {}", a.first - full_input.begin(), *a.first, *a.second); + } + } + + return pass; +} +} // namespace fd_unit_tests::erisc::kernels + +TEST_F(CommandQueuePCIDevicesFixture, EthKernelsDirectRingGatherAllChips) { + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; + const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; + const size_t sem_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + const auto& device_ring = get_device_ring(devices_); + if (device_ring.empty()) { + GTEST_SKIP(); + } + ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_direct_ring_gather_sender_receiver_kernels( + device_ring, WORD_SIZE, src_eth_l1_byte_address, dst_eth_l1_byte_address, sem_l1_byte_address)); +} + +TEST_F(CommandQueuePCIDevicesFixture, EthKernelsInterleavedRingGatherAllChips) { + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; + const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; + const size_t sem_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + BankedConfig test_config = + BankedConfig{.num_pages = 10, .size_bytes = 10 * 2 * 32 * 32, .page_size_bytes = 2 * 32 * 32}; + const auto& device_ring = get_device_ring(devices_); + if (device_ring.empty()) { + GTEST_SKIP(); + } + ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_interleaved_ring_gather_sender_receiver_kernels( + device_ring, test_config, src_eth_l1_byte_address, dst_eth_l1_byte_address, sem_l1_byte_address)); +} diff --git a/tt_metal/hw/firmware/src/erisc.cc b/tt_metal/hw/firmware/src/erisc.cc index bf56cae7723..46307658bb3 100644 --- a/tt_metal/hw/firmware/src/erisc.cc +++ b/tt_metal/hw/firmware/src/erisc.cc @@ -26,8 +26,6 @@ namespace kernel_profiler { uint32_t wIndex __attribute__((used)); } -tt_l1_ptr mailboxes_t * const mailboxes = (tt_l1_ptr mailboxes_t 
*)(eth_l1_mem::address_map::ERISC_MEM_MAILBOX_BASE); - uint8_t my_x[NUM_NOCS] __attribute__((used)); uint8_t my_y[NUM_NOCS] __attribute__((used)); @@ -44,7 +42,6 @@ void __attribute__((section("code_l1"))) risc_init() { } void __attribute__((section("erisc_l1_code"))) ApplicationHandler(void) { kernel_profiler::init_profiler(); - kernel_profiler::mark_time(CC_MAIN_START); rtos_context_switch_ptr = (void (*)())RtosTable[0]; risc_init(); @@ -54,14 +51,14 @@ void __attribute__((section("erisc_l1_code"))) ApplicationHandler(void) { noc_local_state_init(n); } ncrisc_noc_full_sync(); - while (1) { - if (erisc_info->num_bytes == 123) { + while (erisc_info->routing_enabled) { + if (erisc_info->launch_user_kernel == 1) { + kernel_profiler::mark_time(CC_MAIN_START); kernel_init(); - break; + kernel_profiler::mark_time(CC_MAIN_END); } else { - risc_context_switch(); + internal_::risc_context_switch(); } } disable_erisc_app(); - kernel_profiler::mark_time(CC_MAIN_END); } diff --git a/tt_metal/hw/firmware/src/erisck.cc b/tt_metal/hw/firmware/src/erisck.cc index 997efd7d847..7cf6e91d75c 100644 --- a/tt_metal/hw/firmware/src/erisck.cc +++ b/tt_metal/hw/firmware/src/erisck.cc @@ -28,13 +28,11 @@ CBInterface cb_interface[NUM_CIRCULAR_BUFFERS]; void __attribute__((section("erisc_l1_code"))) kernel_launch() { rtos_context_switch_ptr = (void (*)())RtosTable[0]; - for (uint32_t n = 0; n < NUM_NOCS; n++) { - noc_local_state_init(n); - } - ncrisc_noc_full_sync(); kernel_profiler::mark_time(CC_KERNEL_MAIN_START); kernel_main(); kernel_profiler::mark_time(CC_KERNEL_MAIN_END); - erisc_info->num_bytes = 0; + uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(DISPATCH_CORE_X), NOC_Y(DISPATCH_CORE_Y), DISPATCH_MESSAGE_ADDR); + internal_::notify_dispatch_core_done(dispatch_addr); + erisc_info->launch_user_kernel = 0; } diff --git a/tt_metal/hw/inc/debug/dprint.h b/tt_metal/hw/inc/debug/dprint.h index 78ac69700f9..abb2bad059e 100644 --- a/tt_metal/hw/inc/debug/dprint.h +++ 
b/tt_metal/hw/inc/debug/dprint.h @@ -182,7 +182,7 @@ DebugPrinter operator <<(DebugPrinter dp, T val) { // buffer is full - wait for the host reader to flush+update rpos while (*dp.rpos() < *dp.wpos()) { #if defined(COMPILE_FOR_ERISC) - risc_context_switch(); + internal_::risc_context_switch(); #endif ; // wait for host to catch up to wpos with it's rpos } diff --git a/tt_metal/hw/inc/dev_msgs.h b/tt_metal/hw/inc/dev_msgs.h index cfcd38e9c56..3ea59e775a6 100644 --- a/tt_metal/hw/inc/dev_msgs.h +++ b/tt_metal/hw/inc/dev_msgs.h @@ -20,6 +20,9 @@ #define GET_MAILBOX_ADDRESS_DEV(x) (&(((mailboxes_t tt_l1_ptr *)MEM_MAILBOX_BASE)->x)) #endif +#define GET_ETH_MAILBOX_ADDRESS_HOST(x) \ + ((uint64_t) & (((mailboxes_t *)eth_l1_mem::address_map::ERISC_MEM_MAILBOX_BASE)->x)) + // Messages for host to tell brisc to go constexpr uint32_t RUN_MSG_INIT = 0x40; constexpr uint32_t RUN_MSG_GO = 0x80; diff --git a/tt_metal/hw/inc/ethernet/dataflow_api.h b/tt_metal/hw/inc/ethernet/dataflow_api.h index e23706fc270..accf4e4f172 100644 --- a/tt_metal/hw/inc/ethernet/dataflow_api.h +++ b/tt_metal/hw/inc/ethernet/dataflow_api.h @@ -15,20 +15,22 @@ inline void RISC_POST_STATUS(uint32_t status) { ptr[0] = status; } struct erisc_info_t { - volatile uint32_t num_bytes; - volatile uint32_t mode; + volatile uint32_t launch_user_kernel; + volatile uint32_t routing_mode; + volatile uint32_t routing_enabled; volatile uint32_t unused_arg0; - volatile uint32_t unused_arg1; - volatile uint32_t bytes_sent; + volatile uint32_t user_buffer_bytes_sent; uint32_t reserved_0_; uint32_t reserved_1_; uint32_t reserved_2_; - volatile uint32_t bytes_received; + volatile uint32_t fast_dispatch_buffer_msgs_sent; uint32_t reserved_3_; uint32_t reserved_4_; uint32_t reserved_5_; }; +tt_l1_ptr mailboxes_t *const mailboxes = (tt_l1_ptr mailboxes_t *)(eth_l1_mem::address_map::ERISC_MEM_MAILBOX_BASE); + erisc_info_t *erisc_info = (erisc_info_t *)(eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); volatile uint32_t 
*flag_disable = (uint32_t *)(eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); @@ -38,33 +40,44 @@ volatile uint32_t *RtosTable = void (*rtos_context_switch_ptr)(); +FORCE_INLINE +void reset_erisc_info() { erisc_info->user_buffer_bytes_sent = 0; } + +FORCE_INLINE +void disable_erisc_app() { flag_disable[0] = 0; } + +namespace internal_ { void __attribute__((section("code_l1"))) risc_context_switch() { ncrisc_noc_full_sync(); rtos_context_switch_ptr(); ncrisc_noc_counters_init(); } -FORCE_INLINE -void reset_erisc_info() { - erisc_info->bytes_sent = 0; -} - -FORCE_INLINE -void disable_erisc_app() { flag_disable[0] = 0; } - FORCE_INLINE void check_and_context_switch() { uint32_t start_time = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); uint32_t end_time = start_time; while (end_time - start_time < 100000) { RISC_POST_STATUS(0xdeadCAFE); - risc_context_switch(); + internal_::risc_context_switch(); RISC_POST_STATUS(0xdeadFEAD); end_time = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); } // proceed } +void notify_dispatch_core_done(uint64_t dispatch_addr) { + // flush both nocs because ethernet kernels could be using different nocs to try to atomically increment semaphore + // in dispatch core + for (uint32_t n = 0; n < NUM_NOCS; n++) { + while (!noc_cmd_buf_ready(n, NCRISC_AT_CMD_BUF)) + ; + } + noc_fast_atomic_increment_l1(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, 1, 31 /*wrap*/, false /*linked*/); +} +} // namespace internal_ + + /** * A blocking call that waits until the value of a local L1 memory address on * the Tensix core executing this function becomes equal to a target value. 
@@ -81,7 +94,7 @@ void check_and_context_switch() { FORCE_INLINE void eth_noc_semaphore_wait(volatile tt_l1_ptr uint32_t* sem_addr, uint32_t val) { while ((*sem_addr) != val) { - risc_context_switch(); + internal_::risc_context_switch(); } } @@ -96,7 +109,7 @@ void eth_noc_semaphore_wait(volatile tt_l1_ptr uint32_t* sem_addr, uint32_t val) FORCE_INLINE void eth_noc_async_read_barrier() { while (!ncrisc_noc_reads_flushed(noc_index)) { - risc_context_switch(); + internal_::risc_context_switch(); } } @@ -111,7 +124,7 @@ void eth_noc_async_read_barrier() { FORCE_INLINE void eth_noc_async_write_barrier() { while (!ncrisc_noc_nonposted_writes_flushed(noc_index)) { - risc_context_switch(); + internal_::risc_context_switch(); } } @@ -140,7 +153,7 @@ void eth_send_bytes( 0, ((num_bytes_sent + src_addr) >> 4), ((num_bytes_sent + dst_addr) >> 4), num_bytes_per_send_word_size); num_bytes_sent += num_bytes_per_send; } - erisc_info->bytes_sent += num_bytes; + erisc_info->user_buffer_bytes_sent += num_bytes; } /** @@ -154,9 +167,13 @@ void eth_send_bytes( */ FORCE_INLINE void eth_wait_for_receiver_done() { - eth_send_packet(0, ((uint32_t)(&(erisc_info->bytes_sent))) >> 4, ((uint32_t)(&(erisc_info->bytes_sent))) >> 4, 1); - while (erisc_info->bytes_sent != 0) { - risc_context_switch(); + eth_send_packet( + 0, + ((uint32_t)(&(erisc_info->user_buffer_bytes_sent))) >> 4, + ((uint32_t)(&(erisc_info->user_buffer_bytes_sent))) >> 4, + 1); + while (erisc_info->user_buffer_bytes_sent != 0) { + internal_::risc_context_switch(); } } @@ -183,14 +200,14 @@ void eth_wait_for_remote_receiver_done_and_get_local_receiver_data( uint32_t local_eth_l1_curr_src_addr, uint32_t size ) { - eth_send_packet(0, ((uint32_t)(&(erisc_info->bytes_sent))) >> 4, ((uint32_t)(&(erisc_info->bytes_sent))) >> 4, 1); + eth_send_packet(0, ((uint32_t)(&(erisc_info->user_buffer_bytes_sent))) >> 4, ((uint32_t)(&(erisc_info->user_buffer_bytes_sent))) >> 4, 1); eth_noc_semaphore_wait(sender_semaphore_addr_ptr, 1); 
noc_async_read(receiver_data_noc_addr, local_eth_l1_curr_src_addr, size); eth_noc_async_read_barrier(); noc_semaphore_set(sender_semaphore_addr_ptr, 0); noc_semaphore_inc(receiver_semaphore_noc_addr, 1); - while (erisc_info->bytes_sent != 0) { - risc_context_switch(); + while (erisc_info->user_buffer_bytes_sent != 0) { + internal_::risc_context_switch(); } } /** @@ -207,8 +224,8 @@ void eth_wait_for_remote_receiver_done_and_get_local_receiver_data( */ FORCE_INLINE void eth_wait_for_bytes(uint32_t num_bytes) { - while (erisc_info->bytes_sent != num_bytes) { - risc_context_switch(); + while (erisc_info->user_buffer_bytes_sent != num_bytes) { + internal_::risc_context_switch(); } } @@ -223,6 +240,10 @@ void eth_wait_for_bytes(uint32_t num_bytes) { */ FORCE_INLINE void eth_receiver_done() { - erisc_info->bytes_sent = 0; - eth_send_packet(0, ((uint32_t)(&(erisc_info->bytes_sent))) >> 4, ((uint32_t)(&(erisc_info->bytes_sent))) >> 4, 1); + erisc_info->user_buffer_bytes_sent = 0; + eth_send_packet( + 0, + ((uint32_t)(&(erisc_info->user_buffer_bytes_sent))) >> 4, + ((uint32_t)(&(erisc_info->user_buffer_bytes_sent))) >> 4, + 1); } diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index fad59dcf8d3..ace98249d22 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -169,8 +169,10 @@ void Device::initialize_firmware(CoreCoord phys_core, launch_msg_t *launch_msg) int eriscv_id = build_processor_type_to_index(JitBuildProcessorType::ETHERNET).first + 0; ll_api::memory binary_mem = llrt::get_risc_binary(firmware_build_states_[eriscv_id]->get_target_out_path("")); uint32_t kernel_size16 = llrt::get_binary_code_size16(binary_mem, eriscv_id); + llrt::write_hex_vec_to_core(this->id(), phys_core, {1}, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE + 8); log_debug(LogDevice, "ERISC fw binary size: {} in bytes", kernel_size16 * 16); llrt::test_load_write_read_risc_binary(binary_mem, this->id(), phys_core, eriscv_id); + 
llrt::launch_erisc_app_fw_on_core(this->id(), phys_core); } else { llrt::program_brisc_startup_addr(this->id(), phys_core); for (int riscv_id = 0; riscv_id < 5; riscv_id++) { diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 7aaf4f65804..9c0ebb272d9 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -32,18 +32,48 @@ ProgramMap ConstructProgramMap(const Device* device, Program& program) { /* TODO(agrebenisan): Move this logic to compile program */ - vector runtime_arg_page_transfers; - vector cb_config_page_transfers; - vector program_page_transfers; - vector go_signal_page_transfers; - vector num_transfers_in_runtime_arg_pages; // Corresponds to the number of transfers within host data pages across all host data pages - vector num_transfers_in_cb_config_pages; - vector num_transfers_in_program_pages; - vector num_transfers_in_go_signal_pages; + std::unordered_map> program_page_transfers = { + {PageTransferType::MULTICAST, {}}, {PageTransferType::UNICAST, {}}}; + std::unordered_map> runtime_arg_page_transfers = { + {PageTransferType::MULTICAST, {}}, {PageTransferType::UNICAST, {}}}; + std::unordered_map> cb_config_page_transfers = { + {PageTransferType::MULTICAST, {}}, {PageTransferType::UNICAST, {}}}; + std::unordered_map> go_signal_page_transfers = { + {PageTransferType::MULTICAST, {}}, {PageTransferType::UNICAST, {}}}; + std::unordered_map> num_transfers_in_program_pages = { + {PageTransferType::MULTICAST, {}}, {PageTransferType::UNICAST, {}}}; + std::unordered_map> num_transfers_in_runtime_arg_pages = { + {PageTransferType::MULTICAST, {}}, + {PageTransferType::UNICAST, + {}}}; // Corresponds to the number of transfers within host data pages across all host data pages + std::unordered_map> num_transfers_in_cb_config_pages = { + {PageTransferType::MULTICAST, {}}, {PageTransferType::UNICAST, {}}}; + std::unordered_map> num_transfers_in_go_signal_pages = { + 
{PageTransferType::MULTICAST, {}}, {PageTransferType::UNICAST, {}}}; + + static const map processor_to_local_mem_addr = { + {RISCV::BRISC, MEM_BRISC_INIT_LOCAL_L1_BASE}, + {RISCV::NCRISC, MEM_NCRISC_INIT_LOCAL_L1_BASE}, + {RISCV::TRISC0, MEM_TRISC0_INIT_LOCAL_L1_BASE}, + {RISCV::TRISC1, MEM_TRISC1_INIT_LOCAL_L1_BASE}, + {RISCV::TRISC2, MEM_TRISC2_INIT_LOCAL_L1_BASE}, + {RISCV::ERISC, eth_l1_mem::address_map::FIRMWARE_BASE}}; + + static const map processor_to_l1_arg_base_addr = { + {RISCV::BRISC, BRISC_L1_ARG_BASE}, + {RISCV::NCRISC, NCRISC_L1_ARG_BASE}, + {RISCV::COMPUTE, TRISC_L1_ARG_BASE}, + {RISCV::ERISC, eth_l1_mem::address_map::ERISC_L1_ARG_BASE}, + }; + uint32_t num_transfers_within_page = 0; uint32_t src = 0; + vector program_pages; + uint32_t program_page_idx = 0; + uint32_t program_new_page_tracker = 0; constexpr static uint32_t noc_transfer_alignment_in_bytes = 16; + auto update_program_page_transfers = [&num_transfers_within_page]( uint32_t src, uint32_t num_bytes, @@ -78,12 +108,13 @@ ProgramMap ConstructProgramMap(const Device* device, Program& program) { return src; }; - auto extract_dst_noc_multicast_info = [&device](const set& ranges) -> vector> { + auto extract_dst_noc_multicast_info = + [&device](const set& ranges, const CoreType core_type) -> vector> { // This API extracts all the pairs of noc multicast encodings given a set of core ranges vector> dst_noc_multicast_info; for (const CoreRange& core_range : ranges) { - CoreCoord physical_start = device->worker_core_from_logical_core(core_range.start); - CoreCoord physical_end = device->worker_core_from_logical_core(core_range.end); + CoreCoord physical_start = device->physical_core_from_logical_core(core_range.start, core_type); + CoreCoord physical_end = device->physical_core_from_logical_core(core_range.end, core_type); uint32_t dst_noc_multicast_encoding = get_noc_multicast_encoding(physical_start, physical_end); @@ -93,11 +124,117 @@ ProgramMap ConstructProgramMap(const Device* device, Program& 
program) { return dst_noc_multicast_info; }; - static const map processor_to_l1_arg_base_addr = { - {RISCV::BRISC, BRISC_L1_ARG_BASE}, - {RISCV::NCRISC, NCRISC_L1_ARG_BASE}, - {RISCV::COMPUTE, TRISC_L1_ARG_BASE}, + auto update_program_pages_with_new_page = [&program_pages, &src, &program_new_page_tracker]() { + program_pages.resize(program_pages.size() + align(src, DeviceCommand::PROGRAM_PAGE_SIZE) / sizeof(uint32_t), 0); + src = 0; + program_new_page_tracker++; + }; + auto align_program_page_idx_to_new_page = [&program_page_idx, &program_new_page_tracker]() { + program_page_idx = align(program_page_idx, DeviceCommand::PROGRAM_PAGE_SIZE / sizeof(uint32_t)); + program_new_page_tracker--; + }; + + auto update_program_page_for_kernel_group = + [&program_page_transfers, + &num_transfers_in_program_pages, + &update_program_page_transfers, + &extract_dst_noc_multicast_info, + &device, + &program](uint32_t src, const KernelGroup& kernel_group, PageTransferType page_transfer_type) -> uint32_t { + vector> dst_noc_multicast_info = + extract_dst_noc_multicast_info(kernel_group.core_ranges.ranges(), kernel_group.get_core_type()); + + // So far, we don't support linking optimizations for kernel groups + // which use multiple core ranges + bool linked = dst_noc_multicast_info.size() == 1; + + vector kernel_ids; + if (kernel_group.riscv0_id) + kernel_ids.push_back(kernel_group.riscv0_id.value()); + if (kernel_group.riscv1_id) + kernel_ids.push_back(kernel_group.riscv1_id.value()); + if (kernel_group.compute_id) + kernel_ids.push_back(kernel_group.compute_id.value()); + if (kernel_group.erisc_id) + kernel_ids.push_back(kernel_group.erisc_id.value()); + + for (size_t i = 0; i < kernel_ids.size(); i++) { + KernelHandle kernel_id = kernel_ids[i]; + vector sub_kernels; + const Kernel* kernel = detail::GetKernel(program, kernel_id); + if (kernel->processor() == RISCV::COMPUTE) { + sub_kernels = {RISCV::TRISC0, RISCV::TRISC1, RISCV::TRISC2}; + } else { + sub_kernels = 
{kernel->processor()}; + } + + uint32_t sub_kernel_index = 0; + const auto& binaries = kernel->binaries(device->id()); + for (size_t j = 0; j < binaries.size(); j++) { + const ll_api::memory& kernel_bin = binaries[j]; + + uint32_t k = 0; + uint32_t num_spans = kernel_bin.num_spans(); + kernel_bin.process_spans([&](vector::const_iterator mem_ptr, uint64_t dst, uint32_t len) { + linked &= (i != kernel_ids.size() - 1) or (j != binaries.size() - 1) or (k != num_spans - 1); + uint64_t relo_addr = + tt::llrt::relocate_dev_addr(dst, processor_to_local_mem_addr.at(sub_kernels[sub_kernel_index])); + + uint32_t num_bytes = len * sizeof(uint32_t); + + if (page_transfer_type == PageTransferType::UNICAST) { + for (const auto& logical_core : kernel->logical_cores()) { + uint32_t dst_noc = get_noc_unicast_encoding( + device->physical_core_from_logical_core(logical_core, kernel_group.get_core_type())); + src = update_program_page_transfers( + src, + num_bytes, + relo_addr, + program_page_transfers.at(PageTransferType::UNICAST), + num_transfers_in_program_pages.at(PageTransferType::UNICAST), + {{dst_noc, 1}}); + } + } else if (page_transfer_type == PageTransferType::MULTICAST) { + src = update_program_page_transfers( + src, + num_bytes, + relo_addr, + program_page_transfers.at(PageTransferType::MULTICAST), + num_transfers_in_program_pages.at(PageTransferType::MULTICAST), + dst_noc_multicast_info, + linked); + } + k++; + }); + sub_kernel_index++; + } + } + return src; }; + auto populate_program_binaries_pages = + [&program_pages, &program_page_idx, &device, &program](const KernelGroup& kernel_group) { + vector kernel_ids; + if (kernel_group.riscv0_id) + kernel_ids.push_back(kernel_group.riscv0_id.value()); + if (kernel_group.riscv1_id) + kernel_ids.push_back(kernel_group.riscv1_id.value()); + if (kernel_group.compute_id) + kernel_ids.push_back(kernel_group.compute_id.value()); + if (kernel_group.erisc_id) + kernel_ids.push_back(kernel_group.erisc_id.value()); + for (KernelHandle 
kernel_id : kernel_ids) { + const Kernel* kernel = detail::GetKernel(program, kernel_id); + + for (const ll_api::memory& kernel_bin : kernel->binaries(device->id())) { + kernel_bin.process_spans([&](vector::const_iterator mem_ptr, uint64_t dst, uint32_t len) { + std::copy(mem_ptr, mem_ptr + len, program_pages.begin() + program_page_idx); + + program_page_idx = + align(program_page_idx + len, noc_transfer_alignment_in_bytes / sizeof(uint32_t)); + }); + } + } + }; // Step 1: Get transfer info for runtime args (soon to just be host data). We // want to send host data first because of the higher latency to pull @@ -105,168 +242,163 @@ ProgramMap ConstructProgramMap(const Device* device, Program& program) { for (size_t kernel_id = 0; kernel_id < program.num_kernels(); kernel_id++) { Kernel* kernel = detail::GetKernel(program, kernel_id); uint32_t dst = processor_to_l1_arg_base_addr.at(kernel->processor()); - for (const auto &core_coord : kernel->cores_with_runtime_args()) { - CoreCoord physical_core = device->worker_core_from_logical_core(core_coord); - const auto & runtime_args = kernel->runtime_args(core_coord); + const auto& kernel_core_type = kernel->get_kernel_core_type(); + for (const auto& core_coord : kernel->cores_with_runtime_args()) { + CoreCoord physical_core = + device->physical_core_from_logical_core(core_coord, kernel->get_kernel_core_type()); + const auto& runtime_args = kernel->runtime_args(core_coord); uint32_t num_bytes = runtime_args.size() * sizeof(uint32_t); uint32_t dst_noc = get_noc_unicast_encoding(physical_core); // Only one receiver per set of runtime arguments src = update_program_page_transfers( - src, num_bytes, dst, runtime_arg_page_transfers, num_transfers_in_runtime_arg_pages, {{dst_noc, 1}}); + src, + num_bytes, + dst, + runtime_arg_page_transfers.at(PageTransferType::MULTICAST), + num_transfers_in_runtime_arg_pages.at(PageTransferType::MULTICAST), + {{dst_noc, 1}}); } } // Cleanup step of separating runtime arg pages from program pages 
if (num_transfers_within_page) { - num_transfers_in_runtime_arg_pages.push_back(num_transfers_within_page); + num_transfers_in_runtime_arg_pages.at(PageTransferType::MULTICAST).push_back(num_transfers_within_page); num_transfers_within_page = 0; } - src = 0; // Resetting since in a new page + src = 0; // Resetting since in a new page // Step 2: Continue constructing pages for circular buffer configs for (const shared_ptr& cb : program.circular_buffers()) { - vector> dst_noc_multicast_info = extract_dst_noc_multicast_info(cb->core_ranges().ranges()); + // No CB support for ethernet cores + vector> dst_noc_multicast_info = + extract_dst_noc_multicast_info(cb->core_ranges().ranges(), CoreType::WORKER); constexpr static uint32_t num_bytes = UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t); for (const auto buffer_index : cb->buffer_indices()) { src = update_program_page_transfers( src, num_bytes, CIRCULAR_BUFFER_CONFIG_BASE + buffer_index * UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t), - cb_config_page_transfers, - num_transfers_in_cb_config_pages, + cb_config_page_transfers.at(PageTransferType::MULTICAST), + num_transfers_in_cb_config_pages.at(PageTransferType::MULTICAST), dst_noc_multicast_info); } } // Cleanup step of separating runtime arg pages from program pages if (num_transfers_within_page) { - num_transfers_in_cb_config_pages.push_back(num_transfers_within_page); + num_transfers_in_cb_config_pages.at(PageTransferType::MULTICAST).push_back(num_transfers_within_page); num_transfers_within_page = 0; } - static const map processor_to_local_mem_addr = { - {RISCV::BRISC, MEM_BRISC_INIT_LOCAL_L1_BASE}, - {RISCV::NCRISC, MEM_NCRISC_INIT_LOCAL_L1_BASE}, - {RISCV::TRISC0, MEM_TRISC0_INIT_LOCAL_L1_BASE}, - {RISCV::TRISC1, MEM_TRISC1_INIT_LOCAL_L1_BASE}, - {RISCV::TRISC2, MEM_TRISC2_INIT_LOCAL_L1_BASE}}; - - // Step 3: Determine the transfer information for each program binary - src = 0; // Restart src since it begins in a new page - for (const 
KernelGroup &kg: program.get_kernel_groups()) { - - vector> dst_noc_multicast_info = - extract_dst_noc_multicast_info(kg.core_ranges.ranges()); - - // So far, we don't support linking optimizations for kernel groups - // which use multiple core ranges - bool linked = dst_noc_multicast_info.size() == 1; - - vector kernel_ids; - if (kg.riscv0_id) kernel_ids.push_back(kg.riscv0_id.value()); - if (kg.riscv1_id) kernel_ids.push_back(kg.riscv1_id.value()); - if (kg.compute_id) kernel_ids.push_back(kg.compute_id.value()); - - uint32_t src_copy = src; - for (size_t i = 0; i < kernel_ids.size(); i++) { - KernelHandle kernel_id = kernel_ids[i]; - vector sub_kernels; - const Kernel* kernel = detail::GetKernel(program, kernel_id); - if (kernel->processor() == RISCV::COMPUTE) { - sub_kernels = {RISCV::TRISC0, RISCV::TRISC1, RISCV::TRISC2}; - } else { - sub_kernels = {kernel->processor()}; - } - - uint32_t sub_kernel_index = 0; - const auto& binaries = kernel->binaries(device->id()); - for (size_t j = 0; j < binaries.size(); j++) { - const ll_api::memory& kernel_bin = binaries[j]; - - uint32_t k = 0; - uint32_t num_spans = kernel_bin.num_spans(); - kernel_bin.process_spans([&](vector::const_iterator mem_ptr, uint64_t dst, uint32_t len) { - linked &= (i != kernel_ids.size() - 1) or (j != binaries.size() - 1) or (k != num_spans - 1); - - uint32_t num_bytes = len * sizeof(uint32_t); - if ((dst & MEM_LOCAL_BASE) == MEM_LOCAL_BASE) { - dst = (dst & ~MEM_LOCAL_BASE) + processor_to_local_mem_addr.at(sub_kernels[sub_kernel_index]); - } else if ((dst & MEM_NCRISC_IRAM_BASE) == MEM_NCRISC_IRAM_BASE) { - dst = (dst & ~MEM_NCRISC_IRAM_BASE) + MEM_NCRISC_INIT_IRAM_L1_BASE; - } - - src = update_program_page_transfers( - src, num_bytes, dst, program_page_transfers, num_transfers_in_program_pages, dst_noc_multicast_info, linked); - k++; - }); - sub_kernel_index++; - } + // Split kernel groups by multicast/unicast, program multicast transfers first then unicast + std::vector 
kernel_group_multicast; + std::vector kernel_group_unicast; + for (const KernelGroup& kernel_group : program.get_kernel_groups()) { + if (kernel_group.get_core_type() == CoreType::WORKER) { + kernel_group_multicast.emplace_back(kernel_group); + } else if (kernel_group.get_core_type() == CoreType::ETH) { + kernel_group_unicast.emplace_back(kernel_group); + } else { + TT_ASSERT(false, "Constructing command for unsupported core type"); } } - - // Step 4: Continue constructing pages for semaphore configs + // Enqueue program binaries and go siggals in this order: + // - Multicast Program Binaries + // - Unicast Program Binaries + // - Multicast Go Signals + // - Unicast Go Signals + // This probably has better perf than sending binaries and go signals together: + // - Multicast Program Binaries + // - Multicast Go Signals + // - Unicast Program Binaries + // - Unicast Go Signals + // Step 3a (Multicast): Determine the transfer information for each program binary + src = 0; // Restart src since multicast program binaries begins in a new page + for (const KernelGroup& kernel_group : kernel_group_multicast) { + src = update_program_page_for_kernel_group(src, kernel_group, PageTransferType::MULTICAST); + } + // Step 4 (Multicast): Continue constructing pages for semaphore configs, only multicast/worker cores supported for (const Semaphore& semaphore : program.semaphores()) { vector> dst_noc_multicast_info = - extract_dst_noc_multicast_info(semaphore.core_range_set().ranges()); + extract_dst_noc_multicast_info(semaphore.core_range_set().ranges(), CoreType::WORKER); src = update_program_page_transfers( src, L1_ALIGNMENT, semaphore.address(), - program_page_transfers, - num_transfers_in_program_pages, + program_page_transfers.at(PageTransferType::MULTICAST), + num_transfers_in_program_pages.at(PageTransferType::MULTICAST), dst_noc_multicast_info); } if (num_transfers_within_page) { - num_transfers_in_program_pages.push_back(num_transfers_within_page); + 
num_transfers_in_program_pages.at(PageTransferType::MULTICAST).push_back(num_transfers_within_page); num_transfers_within_page = 0; } - vector program_pages(align(src, DeviceCommand::PROGRAM_PAGE_SIZE) / sizeof(uint32_t), 0); + // Step 3b (Unicast) + // skipping step 4 since no semaphore support + update_program_pages_with_new_page(); // sets src to 0 since unicast program binaries begins in new page + for (const KernelGroup& kernel_group : kernel_group_unicast) { + src = update_program_page_for_kernel_group(src, kernel_group, PageTransferType::UNICAST); + } + if (num_transfers_within_page) { + num_transfers_in_program_pages.at(PageTransferType::UNICAST).push_back(num_transfers_within_page); + num_transfers_within_page = 0; + } - // Step 5: Continue constructing pages for GO signals - src = 0; - for (KernelGroup& kg : program.get_kernel_groups()) { - kg.launch_msg.mode = DISPATCH_MODE_DEV; + // Step 5a (Multicast): Continue constructing pages for GO signals, multicast first then unicast + update_program_pages_with_new_page(); // sets src to 0 since multicast signals begins in new page + for (KernelGroup& kernel_group : kernel_group_multicast) { + kernel_group.launch_msg.mode = DISPATCH_MODE_DEV; vector> dst_noc_multicast_info = - extract_dst_noc_multicast_info(kg.core_ranges.ranges()); - + extract_dst_noc_multicast_info(kernel_group.core_ranges.ranges(), kernel_group.get_core_type()); src = update_program_page_transfers( src, sizeof(launch_msg_t), GET_MAILBOX_ADDRESS_HOST(launch), - go_signal_page_transfers, - num_transfers_in_go_signal_pages, - dst_noc_multicast_info - ); + go_signal_page_transfers.at(PageTransferType::MULTICAST), + num_transfers_in_go_signal_pages.at(PageTransferType::MULTICAST), + dst_noc_multicast_info); + } + if (num_transfers_within_page) { + num_transfers_in_go_signal_pages.at(PageTransferType::MULTICAST).push_back(num_transfers_within_page); + num_transfers_within_page = 0; } + // Step 5b (Unicast) + update_program_pages_with_new_page(); // 
sets src to 0 since unicast signals begins in new page + for (const KernelGroup& kernel_group : kernel_group_unicast) { + if (kernel_group.get_core_type() == CoreType::ETH) { + const Kernel* kernel = detail::GetKernel(program, kernel_group.erisc_id.value()); + for (const auto& logical_eth_core : kernel->logical_cores()) { + uint32_t dst_noc = + get_noc_unicast_encoding(device->physical_core_from_logical_core(logical_eth_core, CoreType::ETH)); + src = update_program_page_transfers( + src, + sizeof(uint32_t), + eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE, + go_signal_page_transfers.at(PageTransferType::UNICAST), + num_transfers_in_go_signal_pages.at(PageTransferType::UNICAST), + {{dst_noc, 1}}); + } + } else { + TT_ASSERT(false, "All non-ethernet core go signals should be muticasted"); + } + } if (num_transfers_within_page) { - num_transfers_in_go_signal_pages.push_back(num_transfers_within_page); + num_transfers_in_go_signal_pages.at(PageTransferType::UNICAST).push_back(num_transfers_within_page); + num_transfers_within_page = 0; } // Allocate some more space for GO signal - program_pages.resize(program_pages.size() + align(src, DeviceCommand::PROGRAM_PAGE_SIZE) / sizeof(uint32_t)); + update_program_pages_with_new_page(); // sets src to 0, but not needed // Create a vector of all program binaries/cbs/semaphores - uint32_t program_page_idx = 0; - for (const KernelGroup &kg: program.get_kernel_groups()) { - vector kernel_ids; - if (kg.riscv0_id) kernel_ids.push_back(kg.riscv0_id.value()); - if (kg.riscv1_id) kernel_ids.push_back(kg.riscv1_id.value()); - if (kg.compute_id) kernel_ids.push_back(kg.compute_id.value()); - for (KernelHandle kernel_id: kernel_ids) { - const Kernel* kernel = detail::GetKernel(program, kernel_id); - - for (const ll_api::memory& kernel_bin : kernel->binaries(device->id())) { - kernel_bin.process_spans([&](vector::const_iterator mem_ptr, uint64_t dst, uint32_t len) { - std::copy(mem_ptr, mem_ptr + len, program_pages.begin() + 
program_page_idx); - program_page_idx = align(program_page_idx + len, noc_transfer_alignment_in_bytes / sizeof(uint32_t)); - }); - } - } + align_program_page_idx_to_new_page(); + for (const KernelGroup& kernel_group : kernel_group_multicast) { + populate_program_binaries_pages(kernel_group); } for (const Semaphore& semaphore : program.semaphores()) { @@ -274,28 +406,50 @@ ProgramMap ConstructProgramMap(const Device* device, Program& program) { program_page_idx += 4; } - // Since GO signal begin in a new page, I need to advance my idx - program_page_idx = align(program_page_idx, DeviceCommand::PROGRAM_PAGE_SIZE / sizeof(uint32_t)); + align_program_page_idx_to_new_page(); + for (const KernelGroup& kernel_group : kernel_group_unicast) { + populate_program_binaries_pages(kernel_group); + } + // Since GO signal begin in a new page, I need to advance my idx + align_program_page_idx_to_new_page(); // uint32_t dispatch_core_word = ((uint32_t)dispatch_core.y << 16) | dispatch_core.x; - for (KernelGroup& kg: program.get_kernel_groups()) { + for (KernelGroup& kernel_group : kernel_group_multicast) { // TODO(agrebenisan): Hanging when we extend the launch msg. Needs to be investigated. For now, // only supporting enqueue program for cq 0 on a device. 
- // kg.launch_msg.dispatch_core_x = dispatch_core.x; - // kg.launch_msg.dispatch_core_y = dispatch_core.y; + // kernel_group.launch_msg.dispatch_core_x = dispatch_core.x; + // kernel_group.launch_msg.dispatch_core_y = dispatch_core.y; static_assert(sizeof(launch_msg_t) % sizeof(uint32_t) == 0); - uint32_t *launch_message_data = (uint32_t *)&kg.launch_msg; + uint32_t* launch_message_data = (uint32_t*)&kernel_group.launch_msg; for (int i = 0; i < sizeof(launch_msg_t) / sizeof(uint32_t); i++) { program_pages[program_page_idx + i] = launch_message_data[i]; } program_page_idx += sizeof(launch_msg_t) / sizeof(uint32_t); } + align_program_page_idx_to_new_page(); + for (KernelGroup& kernel_group : kernel_group_unicast) { + if (kernel_group.get_core_type() == CoreType::ETH) { + const Kernel* kernel = detail::GetKernel(program, kernel_group.erisc_id.value()); + for (const auto& logical_eth_core : kernel->logical_cores()) { + program_pages[program_page_idx] = 1; + program_page_idx += 4; // 16 byte L1 alignment + } + } else { + TT_ASSERT(false, "All non-ethernet core go signals should be muticasted"); + } + } + + TT_ASSERT( + program_new_page_tracker == 0, "Number of new program pages not aligned between sizing and populating data."); + uint32_t num_workers = 0; + // Explicitly sum the worker and eth cores, since we don't have support for all core types if (program.logical_cores().find(CoreType::WORKER) != program.logical_cores().end()) { - num_workers = program.logical_cores().at(CoreType::WORKER).size(); + num_workers += program.logical_cores().at(CoreType::WORKER).size(); + } else if (program.logical_cores().find(CoreType::ETH) != program.logical_cores().end()) { + num_workers += program.logical_cores().at(CoreType::ETH).size(); } - return { .num_workers = num_workers, .program_pages = std::move(program_pages), @@ -640,19 +794,30 @@ const DeviceCommand EnqueueProgramCommand::assemble_device_command(uint32_t host // info constexpr static uint32_t dummy_dst_addr = 0; 
constexpr static uint32_t dummy_buffer_type = 0; - uint32_t num_runtime_arg_pages = this->program_to_dev_map.num_transfers_in_runtime_arg_pages.size(); - uint32_t num_cb_config_pages = this->program_to_dev_map.num_transfers_in_cb_config_pages.size(); - uint32_t num_program_binary_pages = this->program_to_dev_map.num_transfers_in_program_pages.size(); - uint32_t num_go_signal_pages = this->program_to_dev_map.num_transfers_in_go_signal_pages.size(); + uint32_t num_runtime_arg_pages = + this->program_to_dev_map.num_transfers_in_runtime_arg_pages.at(PageTransferType::MULTICAST).size(); + uint32_t num_cb_config_pages = + this->program_to_dev_map.num_transfers_in_cb_config_pages.at(PageTransferType::MULTICAST).size(); + uint32_t num_program_multicast_binary_pages = + this->program_to_dev_map.num_transfers_in_program_pages.at(PageTransferType::MULTICAST).size(); + uint32_t num_program_unicast_binary_pages = + this->program_to_dev_map.num_transfers_in_program_pages.at(PageTransferType::UNICAST).size(); + uint32_t num_go_signal_multicast_pages = + this->program_to_dev_map.num_transfers_in_go_signal_pages.at(PageTransferType::MULTICAST).size(); + uint32_t num_go_signal_unicast_pages = + this->program_to_dev_map.num_transfers_in_go_signal_pages.at(PageTransferType::UNICAST).size(); uint32_t num_host_data_pages = num_runtime_arg_pages + num_cb_config_pages; - uint32_t num_cached_pages = num_program_binary_pages + num_go_signal_pages; + uint32_t num_cached_pages = num_program_multicast_binary_pages + num_go_signal_multicast_pages + + num_program_unicast_binary_pages + num_go_signal_unicast_pages; uint32_t total_num_pages = num_host_data_pages + num_cached_pages; command.set_page_size(DeviceCommand::PROGRAM_PAGE_SIZE); command.set_num_pages(DeviceCommand::TransferType::RUNTIME_ARGS, num_runtime_arg_pages); command.set_num_pages(DeviceCommand::TransferType::CB_CONFIGS, num_cb_config_pages); - command.set_num_pages(DeviceCommand::TransferType::PROGRAM_PAGES, 
num_program_binary_pages); - command.set_num_pages(DeviceCommand::TransferType::GO_SIGNALS, num_go_signal_pages); + command.set_num_pages(DeviceCommand::TransferType::PROGRAM_MULTICAST_PAGES, num_program_multicast_binary_pages); + command.set_num_pages(DeviceCommand::TransferType::PROGRAM_UNICAST_PAGES, num_program_unicast_binary_pages); + command.set_num_pages(DeviceCommand::TransferType::GO_SIGNALS_MULTICAST, num_go_signal_multicast_pages); + command.set_num_pages(DeviceCommand::TransferType::GO_SIGNALS_UNICAST, num_go_signal_unicast_pages); command.set_num_pages(total_num_pages); command.set_data_size( @@ -671,12 +836,14 @@ const DeviceCommand EnqueueProgramCommand::assemble_device_command(uint32_t host if (num_runtime_arg_pages) { populate_program_data_transfer_instructions( - this->program_to_dev_map.num_transfers_in_runtime_arg_pages, this->program_to_dev_map.runtime_arg_page_transfers); + this->program_to_dev_map.num_transfers_in_runtime_arg_pages.at(PageTransferType::MULTICAST), + this->program_to_dev_map.runtime_arg_page_transfers.at(PageTransferType::MULTICAST)); } if (num_cb_config_pages) { populate_program_data_transfer_instructions( - this->program_to_dev_map.num_transfers_in_cb_config_pages, this->program_to_dev_map.cb_config_page_transfers); + this->program_to_dev_map.num_transfers_in_cb_config_pages.at(PageTransferType::MULTICAST), + this->program_to_dev_map.cb_config_page_transfers.at(PageTransferType::MULTICAST)); } } @@ -689,14 +856,27 @@ const DeviceCommand EnqueueProgramCommand::assemble_device_command(uint32_t host uint32_t(this->buffer.buffer_type()), dummy_buffer_type, page_index_offset, page_index_offset); - if (num_program_binary_pages) { + if (num_program_multicast_binary_pages) { populate_program_data_transfer_instructions( - this->program_to_dev_map.num_transfers_in_program_pages, this->program_to_dev_map.program_page_transfers); + this->program_to_dev_map.num_transfers_in_program_pages.at(PageTransferType::MULTICAST), + 
this->program_to_dev_map.program_page_transfers.at(PageTransferType::MULTICAST)); } - if (num_go_signal_pages) { + if (num_program_unicast_binary_pages) { + populate_program_data_transfer_instructions( + this->program_to_dev_map.num_transfers_in_program_pages.at(PageTransferType::UNICAST), + this->program_to_dev_map.program_page_transfers.at(PageTransferType::UNICAST)); + } + + if (num_go_signal_multicast_pages) { + populate_program_data_transfer_instructions( + this->program_to_dev_map.num_transfers_in_go_signal_pages.at(PageTransferType::MULTICAST), + this->program_to_dev_map.go_signal_page_transfers.at(PageTransferType::MULTICAST)); + } + if (num_go_signal_unicast_pages) { populate_program_data_transfer_instructions( - this->program_to_dev_map.num_transfers_in_go_signal_pages, this->program_to_dev_map.go_signal_page_transfers); + this->program_to_dev_map.num_transfers_in_go_signal_pages.at(PageTransferType::UNICAST), + this->program_to_dev_map.go_signal_page_transfers.at(PageTransferType::UNICAST)); } } diff --git a/tt_metal/impl/dispatch/command_queue.hpp b/tt_metal/impl/dispatch/command_queue.hpp index 7bcebb57d62..10a4d45f921 100644 --- a/tt_metal/impl/dispatch/command_queue.hpp +++ b/tt_metal/impl/dispatch/command_queue.hpp @@ -36,17 +36,19 @@ struct transfer_info { bool linked; }; +enum class PageTransferType { MULTICAST, UNICAST }; + struct ProgramMap { uint32_t num_workers; vector program_pages; - vector program_page_transfers; - vector runtime_arg_page_transfers; - vector cb_config_page_transfers; - vector go_signal_page_transfers; - vector num_transfers_in_program_pages; - vector num_transfers_in_runtime_arg_pages; - vector num_transfers_in_cb_config_pages; - vector num_transfers_in_go_signal_pages; + std::unordered_map> program_page_transfers; + std::unordered_map> runtime_arg_page_transfers; + std::unordered_map> cb_config_page_transfers; + std::unordered_map> go_signal_page_transfers; + std::unordered_map> num_transfers_in_program_pages; + 
std::unordered_map> num_transfers_in_runtime_arg_pages; + std::unordered_map> num_transfers_in_cb_config_pages; + std::unordered_map> num_transfers_in_go_signal_pages; }; // Only contains the types of commands which are enqueued onto the device diff --git a/tt_metal/impl/dispatch/device_command.cpp b/tt_metal/impl/dispatch/device_command.cpp index 29cde53ab33..603be519272 100644 --- a/tt_metal/impl/dispatch/device_command.cpp +++ b/tt_metal/impl/dispatch/device_command.cpp @@ -57,11 +57,17 @@ void DeviceCommand::set_num_pages(const DeviceCommand::TransferType transfer_typ case DeviceCommand::TransferType::CB_CONFIGS: this->packet.header.num_cb_config_pages = num_pages; break; - case DeviceCommand::TransferType::PROGRAM_PAGES: - this->packet.header.num_program_pages = num_pages; + case DeviceCommand::TransferType::PROGRAM_MULTICAST_PAGES: + this->packet.header.num_program_multicast_pages = num_pages; break; - case DeviceCommand::TransferType::GO_SIGNALS: - this->packet.header.num_go_signal_pages = num_pages; + case DeviceCommand::TransferType::PROGRAM_UNICAST_PAGES: + this->packet.header.num_program_unicast_pages = num_pages; + break; + case DeviceCommand::TransferType::GO_SIGNALS_MULTICAST: + this->packet.header.num_go_signal_multicast_pages = num_pages; + break; + case DeviceCommand::TransferType::GO_SIGNALS_UNICAST: + this->packet.header.num_go_signal_unicast_pages = num_pages; break; default: TT_ASSERT(false, "Invalid transfer type."); diff --git a/tt_metal/impl/dispatch/device_command.hpp b/tt_metal/impl/dispatch/device_command.hpp index 1b94a1bc5f6..982d60efcac 100644 --- a/tt_metal/impl/dispatch/device_command.hpp +++ b/tt_metal/impl/dispatch/device_command.hpp @@ -23,8 +23,10 @@ struct CommandHeader { uint32_t num_pages = 0; uint32_t num_runtime_arg_pages = 0; uint32_t num_cb_config_pages = 0; - uint32_t num_program_pages = 0; - uint32_t num_go_signal_pages = 0; + uint32_t num_program_multicast_pages = 0; + uint32_t num_program_unicast_pages = 0; + uint32_t 
num_go_signal_multicast_pages = 0; + uint32_t num_go_signal_unicast_pages = 0; uint32_t data_size = 0; uint32_t producer_consumer_transfer_num_pages = 0; uint32_t buffer_type = 0; @@ -38,7 +40,15 @@ class DeviceCommand { public: DeviceCommand(); - enum class TransferType : uint8_t { RUNTIME_ARGS, CB_CONFIGS, PROGRAM_PAGES, GO_SIGNALS, NUM_TRANSFER_TYPES }; + enum class TransferType : uint8_t { + RUNTIME_ARGS, + CB_CONFIGS, + PROGRAM_MULTICAST_PAGES, + PROGRAM_UNICAST_PAGES, + GO_SIGNALS_MULTICAST, + GO_SIGNALS_UNICAST, + NUM_TRANSFER_TYPES + }; // Constants //TODO: investigate other num_cores diff --git a/tt_metal/impl/dispatch/kernels/command_queue_consumer.hpp b/tt_metal/impl/dispatch/kernels/command_queue_consumer.hpp index bd8399fe987..c39fa7ad01f 100644 --- a/tt_metal/impl/dispatch/kernels/command_queue_consumer.hpp +++ b/tt_metal/impl/dispatch/kernels/command_queue_consumer.hpp @@ -287,11 +287,19 @@ void write_and_launch_program( case (uint32_t) DeviceCommand::TransferType::CB_CONFIGS: num_pages_in_transfer = header->num_cb_config_pages; break; - case (uint32_t) DeviceCommand::TransferType::PROGRAM_PAGES: - num_pages_in_transfer = header->num_program_pages; + case (uint32_t) DeviceCommand::TransferType::PROGRAM_MULTICAST_PAGES: + num_pages_in_transfer = header->num_program_multicast_pages; break; - case (uint32_t) DeviceCommand::TransferType::GO_SIGNALS: - num_pages_in_transfer = header->num_go_signal_pages; + case (uint32_t) DeviceCommand::TransferType::PROGRAM_UNICAST_PAGES: + multicast = false; + num_pages_in_transfer = header->num_program_unicast_pages; + break; + case (uint32_t) DeviceCommand::TransferType::GO_SIGNALS_MULTICAST: + num_pages_in_transfer = header->num_go_signal_multicast_pages; + break; + case (uint32_t) DeviceCommand::TransferType::GO_SIGNALS_UNICAST: + multicast = false; + num_pages_in_transfer = header->num_go_signal_unicast_pages; break; } diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 
4344ebfe457..13dbf6e45fb 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -175,6 +175,14 @@ KernelGroup::KernelGroup( this->launch_msg.run = RUN_MSG_GO; } +CoreType KernelGroup::get_core_type() const { + if (this->erisc_id.has_value()) { + return CoreType::ETH; + } else { + return CoreType::WORKER; + } +}; + std::vector& Program::get_kernel_groups() { update_kernel_groups(); return kernel_groups_; diff --git a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp index aabc368fe61..504b0ddd84b 100644 --- a/tt_metal/impl/program/program.hpp +++ b/tt_metal/impl/program/program.hpp @@ -46,6 +46,8 @@ struct KernelGroup { std::optional erisc_id, int last_cb_index, const CoreRangeSet &new_ranges); + + CoreType get_core_type() const; }; class Program { diff --git a/tt_metal/llrt/llrt.cpp b/tt_metal/llrt/llrt.cpp index fdb3593933e..13b720945b1 100644 --- a/tt_metal/llrt/llrt.cpp +++ b/tt_metal/llrt/llrt.cpp @@ -145,8 +145,7 @@ void write_launch_msg_to_core(chip_id_t chip, CoreCoord core, launch_msg_t *msg) msg->mode = DISPATCH_MODE_HOST; TT_ASSERT(sizeof(launch_msg_t) % sizeof(uint32_t) == 0); if (static_cast(msg->enable_erisc)) { - llrt::write_hex_vec_to_core(chip, core, {123}, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); - launch_erisc_app_fw_on_core(chip, core); + llrt::write_hex_vec_to_core(chip, core, {0x1}, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); } else { tt::Cluster::instance().write_core( (void *)msg, sizeof(launch_msg_t), tt_cxy_pair(chip, core), GET_MAILBOX_ADDRESS_HOST(launch)); @@ -287,7 +286,7 @@ namespace internal_ { static bool check_if_riscs_on_specified_core_done(chip_id_t chip_id, const CoreCoord &core, int run_state) { if (is_ethernet_core(core, chip_id)) { const auto &readback_vec = - read_hex_vec_from_core(chip_id, core, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG, sizeof(uint32_t)); + read_hex_vec_from_core(chip_id, core, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE, 
sizeof(uint32_t)); return (readback_vec[0] == 0); } else { std::function get_mailbox_is_done = [&](uint64_t run_mailbox_address) {