From ac71e209bc411b0342b5dead21f11fb675fcf22c Mon Sep 17 00:00:00 2001 From: Vincent Tang Date: Thu, 15 Aug 2024 22:28:11 +0000 Subject: [PATCH] Fix and re-enable most BH unit tests --- device/tt_silicon_driver.cpp | 2 +- tests/blackhole/CMakeLists.txt | 5 +- tests/blackhole/test_silicon_driver_bh.cpp | 679 ++++++++++----------- 3 files changed, 331 insertions(+), 355 deletions(-) diff --git a/device/tt_silicon_driver.cpp b/device/tt_silicon_driver.cpp index d6602a13..0cfdf027 100644 --- a/device/tt_silicon_driver.cpp +++ b/device/tt_silicon_driver.cpp @@ -4294,7 +4294,7 @@ void tt_SiliconDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t else if (arch_name == tt::ARCH::BLACKHOLE) { auto architecture_implementation = tt::umd::architecture_implementation::create(static_cast(arch_name)); if(cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(9) == cols_to_exclude.end()) { - log_assert(!tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Wormhole."); + log_assert(!tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Blackhole."); if(cols_to_exclude.find(0) == cols_to_exclude.end()) { // When broadcast includes column zero do not exclude anything std::set unsafe_rows = {}; diff --git a/tests/blackhole/CMakeLists.txt b/tests/blackhole/CMakeLists.txt index 6a9a6eb0..4ff86013 100644 --- a/tests/blackhole/CMakeLists.txt +++ b/tests/blackhole/CMakeLists.txt @@ -5,6 +5,9 @@ set(UNIT_TESTS_BH_SRCS add_executable(unit_tests_blackhole ${UNIT_TESTS_BH_SRCS}) target_link_libraries(unit_tests_blackhole PRIVATE test_common) -set_target_properties(unit_tests_blackhole PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/test/umd/blackhole) +set_target_properties(unit_tests_blackhole PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/test/umd/blackhole + 
OUTPUT_NAME unit_tests +) add_custom_target(umd_unit_tests DEPENDS unit_tests_blackhole) diff --git a/tests/blackhole/test_silicon_driver_bh.cpp b/tests/blackhole/test_silicon_driver_bh.cpp index 3feec7b7..d6c938aa 100644 --- a/tests/blackhole/test_silicon_driver_bh.cpp +++ b/tests/blackhole/test_silicon_driver_bh.cpp @@ -68,14 +68,23 @@ std::int32_t get_static_tlb_index(tt_xy_pair target) { } } +std::set get_target_devices() { + std::set target_devices; + std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(test_utils::GetAbsPath("blackhole_1chip_cluster.yaml")); + for (int i = 0; i < cluster_desc_uniq->get_number_of_chips(); i++) { + target_devices.insert(i); + } + return target_devices; +} + TEST(SiliconDriverBH, CreateDestroy) { - std::set target_devices = {0}; + std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; std::unordered_map dynamic_tlb_config = {}; // Don't set any dynamic TLBs in this test tt_device_params default_params; // Initialize the driver with a 1x1 descriptor and explictly do not perform harvesting for(int i = 0; i < 50; i++) { - tt_SiliconDevice device = tt_SiliconDevice("./tests/soc_descs/blackhole_140_arch_no_eth.yaml", "./blackhole_1chip_cluster.yaml", target_devices, num_host_mem_ch_per_mmio_device, dynamic_tlb_config, false, true, false); + tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"), target_devices, num_host_mem_ch_per_mmio_device, dynamic_tlb_config, false, true, false); set_params_for_remote_txn(device); device.start_device(default_params); device.deassert_risc_reset(); @@ -202,67 +211,67 @@ TEST(SiliconDriverBH, CreateDestroy) { // device.close_device(); // } -// TEST(SiliconDriverWH, UnalignedStaticTLB_RW) { -// auto get_static_tlb_index_callback = [] (tt_xy_pair target) { -// return get_static_tlb_index(target); -// }; +TEST(SiliconDriverBH, 
UnalignedStaticTLB_RW) { + auto get_static_tlb_index_callback = [] (tt_xy_pair target) { + return get_static_tlb_index(target); + }; -// std::set target_devices = {0, 1}; + std::set target_devices = get_target_devices(); -// std::unordered_map dynamic_tlb_config = {}; // Don't set any dynamic TLBs in this test -// dynamic_tlb_config["REG_TLB"] = 184; -// uint32_t num_host_mem_ch_per_mmio_device = 1; + std::unordered_map dynamic_tlb_config = {}; // Only REG_TLB is configured as a dynamic TLB in this test + dynamic_tlb_config["REG_TLB"] = 184; + uint32_t num_host_mem_ch_per_mmio_device = 1; -// tt_SiliconDevice device = tt_SiliconDevice("./tests/soc_descs/wormhole_b0_8x10.yaml", test_utils::GetClusterDescYAML(), target_devices, num_host_mem_ch_per_mmio_device, dynamic_tlb_config, false, true, true); -// set_params_for_remote_txn(device); -// auto mmio_devices = device.get_target_mmio_device_ids(); + tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"), target_devices, num_host_mem_ch_per_mmio_device, dynamic_tlb_config, false, true, true); + set_params_for_remote_txn(device); + auto mmio_devices = device.get_target_mmio_device_ids(); -// for(int i = 0; i < target_devices.size(); i++) { -// // Iterate over MMIO devices and only setup static TLBs for worker cores -// if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { -// auto& sdesc = device.get_virtual_soc_descriptors().at(i); -// for(auto& core : sdesc.workers) { -// // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. 
-// device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); -// } -// } -// } + for(int i = 0; i < target_devices.size(); i++) { + // Iterate over MMIO devices and only setup static TLBs for worker cores + if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { + auto& sdesc = device.get_virtual_soc_descriptors().at(i); + for(auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. + device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); + } + } + } -// device.setup_core_to_tlb_map(get_static_tlb_index_callback); + device.setup_core_to_tlb_map(get_static_tlb_index_callback); -// tt_device_params default_params; -// device.start_device(default_params); -// device.deassert_risc_reset(); + tt_device_params default_params; + device.start_device(default_params); + device.deassert_risc_reset(); -// std::vector unaligned_sizes = {3, 14, 21, 255, 362, 430, 1022, 1023, 1025}; -// for(int i = 0; i < 2; i++) { -// for(const auto& size : unaligned_sizes) { -// std::vector write_vec(size, 0); -// for(int i = 0; i < size; i++){ -// write_vec[i] = size + i; -// } -// std::vector readback_vec(size, 0); -// std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; -// for(int loop = 0; loop < 50; loop++){ -// for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { -// device.write_to_device(write_vec.data(), size, tt_cxy_pair(i, core), address, ""); -// device.wait_for_non_mmio_flush(); -// device.read_from_device(readback_vec.data(), tt_cxy_pair(i, core), address, size, ""); -// ASSERT_EQ(readback_vec, write_vec); -// readback_vec = std::vector(size, 0); -// device.write_to_sysmem(write_vec.data(), size, 0, 0, 0); -// device.read_from_sysmem(readback_vec.data(), 0, 0, size, 0); -// ASSERT_EQ(readback_vec, write_vec); -// readback_vec = std::vector(size, 0); 
-// device.wait_for_non_mmio_flush(); -// } -// address += 0x20; -// } + std::vector unaligned_sizes = {3, 14, 21, 255, 362, 430, 1022, 1023, 1025}; + for(int i = 0; i < target_devices.size(); i++) { + for(const auto& size : unaligned_sizes) { + std::vector write_vec(size, 0); + for(int i = 0; i < size; i++){ + write_vec[i] = size + i; + } + std::vector readback_vec(size, 0); + std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; + for(int loop = 0; loop < 50; loop++){ + for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device(write_vec.data(), size, tt_cxy_pair(i, core), address, ""); + device.wait_for_non_mmio_flush(); + device.read_from_device(readback_vec.data(), tt_cxy_pair(i, core), address, size, ""); + ASSERT_EQ(readback_vec, write_vec); + readback_vec = std::vector(size, 0); + device.write_to_sysmem(write_vec.data(), size, 0, 0, 0); + device.read_from_sysmem(readback_vec.data(), 0, 0, size, 0); + ASSERT_EQ(readback_vec, write_vec); + readback_vec = std::vector(size, 0); + device.wait_for_non_mmio_flush(); + } + address += 0x20; + } -// } -// } -// device.close_device(); -// } + } + } + device.close_device(); +} TEST(SiliconDriverBH, StaticTLB_RW) { @@ -270,20 +279,12 @@ TEST(SiliconDriverBH, StaticTLB_RW) { return get_static_tlb_index(target); }; - // std::set target_devices = {0, 1}; - std::set target_devices = {0}; - - { - std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml("./blackhole_1chip_cluster.yaml"); - if (cluster_desc_uniq->get_number_of_chips() != target_devices.size()) { - GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula system"; - } - } + std::set target_devices = get_target_devices(); std::unordered_map dynamic_tlb_config = {}; // Don't set any dynamic TLBs in this test uint32_t num_host_mem_ch_per_mmio_device = 1; - tt_SiliconDevice device = tt_SiliconDevice("./tests/soc_descs/blackhole_140_arch_no_eth.yaml", 
"./blackhole_1chip_cluster.yaml", target_devices, num_host_mem_ch_per_mmio_device, dynamic_tlb_config, false, true, true); + tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"), target_devices, num_host_mem_ch_per_mmio_device, dynamic_tlb_config, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -331,19 +332,12 @@ TEST(SiliconDriverBH, StaticTLB_RW) { TEST(SiliconDriverBH, DynamicTLB_RW) { // Don't use any static TLBs in this test. All writes go through a dynamic TLB that needs to be reconfigured for each transaction - std::set target_devices = {0}; - - { - std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml("./blackhole_1chip_cluster.yaml"); - if (cluster_desc_uniq->get_number_of_chips() != target_devices.size()) { - GTEST_SKIP() << "SiliconDriverBH.Harvesting skipped because it can only be run on a two chip nebula system"; - } - } + std::set target_devices = get_target_devices(); std::unordered_map dynamic_tlb_config = {}; uint32_t num_host_mem_ch_per_mmio_device = 1; dynamic_tlb_config.insert({"DYNAMIC_TLB_BASE_INDEX", 190}); // Use this for all reads and writes to worker cores - tt_SiliconDevice device = tt_SiliconDevice("./tests/soc_descs/blackhole_140_arch_no_eth.yaml", "./blackhole_1chip_cluster.yaml", target_devices, num_host_mem_ch_per_mmio_device, dynamic_tlb_config, false, true, true); + tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"), target_devices, num_host_mem_ch_per_mmio_device, dynamic_tlb_config, false, true, true); set_params_for_remote_txn(device); @@ -400,311 +394,290 @@ TEST(SiliconDriverBH, DynamicTLB_RW) { device.close_device(); } -// TEST(SiliconDriverWH, MultiThreadedDevice) { -// // Have 2 threads read and write 
from a single device concurrently -// // All transactions go through a single Dynamic TLB. We want to make sure this is thread/process safe +TEST(SiliconDriverBH, MultiThreadedDevice) { + // Have 2 threads read and write from a single device concurrently + // All transactions go through a single Dynamic TLB. We want to make sure this is thread/process safe -// std::set target_devices = {0}; + std::set target_devices = get_target_devices(); -// { -// std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(test_utils::GetClusterDescYAML()); -// if (cluster_desc_uniq->get_number_of_chips() > 2) { -// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a one or two chip nebula system"; -// } -// } - -// std::unordered_map dynamic_tlb_config = {}; -// uint32_t num_host_mem_ch_per_mmio_device = 1; -// dynamic_tlb_config.insert({"SMALL_READ_WRITE_TLB", 157}); // Use this for all reads and writes to worker cores -// tt_SiliconDevice device = tt_SiliconDevice("./tests/soc_descs/wormhole_b0_8x10.yaml", test_utils::GetClusterDescYAML(), target_devices, num_host_mem_ch_per_mmio_device, dynamic_tlb_config, false, true, true); + std::unordered_map dynamic_tlb_config = {}; + uint32_t num_host_mem_ch_per_mmio_device = 1; + dynamic_tlb_config.insert({"SMALL_READ_WRITE_TLB", 157}); // Use this for all reads and writes to worker cores + tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"), target_devices, num_host_mem_ch_per_mmio_device, dynamic_tlb_config, false, true, true); -// set_params_for_remote_txn(device); + set_params_for_remote_txn(device); -// tt_device_params default_params; -// device.start_device(default_params); -// device.deassert_risc_reset(); + tt_device_params default_params; + device.start_device(default_params); + device.deassert_risc_reset(); -// std::thread th1 = std::thread([&] { -// 
std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; -// std::vector readback_vec = {}; -// std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; -// for(int loop = 0; loop < 100; loop++) { -// for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { -// device.write_to_device(vector_to_write, tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); -// device.read_from_device(readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); -// ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; -// readback_vec = {}; -// } -// address += 0x20; -// } -// }); - -// std::thread th2 = std::thread([&] { -// std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; -// std::vector readback_vec = {}; -// std::uint32_t address = 0x30000000; -// for(auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { -// for(int loop = 0; loop < 100; loop++) { -// for(auto& core : core_ls) { -// device.write_to_device(vector_to_write, tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); -// device.read_from_device(readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); -// ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; -// readback_vec = {}; -// } -// address += 0x20; -// } -// } -// }); + std::thread th1 = std::thread([&] { + std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector readback_vec = {}; + std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; + for(int loop = 0; loop < 100; loop++) { + for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + device.write_to_device(vector_to_write, tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); + device.read_from_device(readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, 
readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + readback_vec = {}; + } + address += 0x20; + } + }); + + std::thread th2 = std::thread([&] { + std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector readback_vec = {}; + std::uint32_t address = 0x30000000; + for(auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { + for(int loop = 0; loop < 100; loop++) { + for(auto& core : core_ls) { + device.write_to_device(vector_to_write, tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); + device.read_from_device(readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + readback_vec = {}; + } + address += 0x20; + } + } + }); -// th1.join(); -// th2.join(); -// device.close_device(); -// } + th1.join(); + th2.join(); + device.close_device(); +} -// TEST(SiliconDriverWH, MultiThreadedMemBar) { -// // Have 2 threads read and write from a single device concurrently -// // All (fairly large) transactions go through a static TLB. -// // We want to make sure the memory barrier is thread/process safe. +TEST(SiliconDriverBH, MultiThreadedMemBar) { + // Have 2 threads read and write from a single device concurrently + // All (fairly large) transactions go through a static TLB. + // We want to make sure the memory barrier is thread/process safe. 
-// // Memory barrier flags get sent to address 0 for all channels in this test -// auto get_static_tlb_index_callback = [] (tt_xy_pair target) { -// return get_static_tlb_index(target); -// }; + // Memory barrier flags get sent to address 0 for all channels in this test + auto get_static_tlb_index_callback = [] (tt_xy_pair target) { + return get_static_tlb_index(target); + }; -// std::set target_devices = {0}; -// uint32_t base_addr = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; -// std::unordered_map dynamic_tlb_config = {}; -// dynamic_tlb_config.insert({"SMALL_READ_WRITE_TLB", 157}); // Use this for reading back membar values -// uint32_t num_host_mem_ch_per_mmio_device = 1; + std::set target_devices = get_target_devices(); + uint32_t base_addr = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; + std::unordered_map dynamic_tlb_config = {}; + dynamic_tlb_config.insert({"SMALL_READ_WRITE_TLB", 157}); // Use this for reading back membar values + uint32_t num_host_mem_ch_per_mmio_device = 1; -// tt_SiliconDevice device = tt_SiliconDevice("./tests/soc_descs/wormhole_b0_8x10.yaml", test_utils::GetClusterDescYAML(), target_devices, num_host_mem_ch_per_mmio_device, dynamic_tlb_config, false, true, true); -// set_params_for_remote_txn(device); -// for(int i = 0; i < target_devices.size(); i++) { -// // Iterate over devices and only setup static TLBs for functional worker cores -// auto& sdesc = device.get_virtual_soc_descriptors().at(i); -// for(auto& core : sdesc.workers) { -// // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
-// device.configure_tlb(i, core, get_static_tlb_index_callback(core), base_addr); -// } -// } -// device.setup_core_to_tlb_map(get_static_tlb_index_callback); + tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"), target_devices, num_host_mem_ch_per_mmio_device, dynamic_tlb_config, false, true, true); + set_params_for_remote_txn(device); + for(int i = 0; i < target_devices.size(); i++) { + // Iterate over devices and only setup static TLBs for functional worker cores + auto& sdesc = device.get_virtual_soc_descriptors().at(i); + for(auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. + device.configure_tlb(i, core, get_static_tlb_index_callback(core), base_addr); + } + } + device.setup_core_to_tlb_map(get_static_tlb_index_callback); -// tt_device_params default_params; -// device.start_device(default_params); -// device.deassert_risc_reset(); + tt_device_params default_params; + device.start_device(default_params); + device.deassert_risc_reset(); -// std::vector readback_membar_vec = {}; -// for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { -// device.read_from_device(readback_membar_vec, tt_cxy_pair(0, core), l1_mem::address_map::L1_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); -// ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers -// readback_membar_vec = {}; -// } + std::vector readback_membar_vec = {}; + for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + device.read_from_device(readback_membar_vec, tt_cxy_pair(0, core), l1_mem::address_map::L1_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers + readback_membar_vec = {}; + } -// for(int chan = 0; chan < 
device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { -// auto core = device.get_virtual_soc_descriptors().at(0).get_core_for_dram_channel(chan, 0); -// device.read_from_device(readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); -// ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM -// readback_membar_vec = {}; -// } + for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { + auto core = device.get_virtual_soc_descriptors().at(0).get_core_for_dram_channel(chan, 0); + device.read_from_device(readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM + readback_membar_vec = {}; + } -// for(auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { -// device.read_from_device(readback_membar_vec, tt_cxy_pair(0, core), eth_l1_mem::address_map::ERISC_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); -// ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all ethernet cores -// readback_membar_vec = {}; -// } + for(auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { + device.read_from_device(readback_membar_vec, tt_cxy_pair(0, core), eth_l1_mem::address_map::ERISC_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all ethernet cores + readback_membar_vec = {}; + } -// // Launch 2 thread accessing different locations of L1 and using memory barrier between write and read -// // Ensure now RAW race and membars are thread safe -// std::vector vec1(2560); -// std::vector vec2(2560); -// std::vector zeros(2560, 0); + // Launch 2 thread accessing different locations of L1 and using memory barrier between write and read + 
// Ensure no RAW race and membars are thread safe + std::vector vec1(2560); + std::vector vec2(2560); + std::vector zeros(2560, 0); -// for(int i = 0; i < vec1.size(); i++) { -// vec1.at(i) = i; -// } -// for(int i = 0; i < vec2.size(); i++) { -// vec2.at(i) = vec1.size() + i; -// } -// std::thread th1 = std::thread([&] { -// std::uint32_t address = base_addr; -// for(int loop = 0; loop < 50; loop++) { -// for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { -// std::vector readback_vec = {}; -// device.write_to_device(vec1, tt_cxy_pair(0, core), address, ""); -// device.l1_membar(0, "SMALL_READ_WRITE_TLB", {core}); -// device.read_from_device(readback_vec, tt_cxy_pair(0, core), address, 4*vec1.size(), ""); -// ASSERT_EQ(readback_vec, vec1); -// device.write_to_device(zeros, tt_cxy_pair(0, core), address, ""); -// readback_vec = {}; -// } + for(int i = 0; i < vec1.size(); i++) { + vec1.at(i) = i; + } + for(int i = 0; i < vec2.size(); i++) { + vec2.at(i) = vec1.size() + i; + } + std::thread th1 = std::thread([&] { + std::uint32_t address = base_addr; + for(int loop = 0; loop < 50; loop++) { + for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + std::vector readback_vec = {}; + device.write_to_device(vec1, tt_cxy_pair(0, core), address, ""); + device.l1_membar(0, "SMALL_READ_WRITE_TLB", {core}); + device.read_from_device(readback_vec, tt_cxy_pair(0, core), address, 4*vec1.size(), ""); + ASSERT_EQ(readback_vec, vec1); + device.write_to_device(zeros, tt_cxy_pair(0, core), address, ""); + readback_vec = {}; + } -// } -// }); - -// std::thread th2 = std::thread([&] { -// std::uint32_t address = base_addr + vec1.size() * 4; -// for(int loop = 0; loop < 50; loop++) { -// for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { -// std::vector readback_vec = {}; -// device.write_to_device(vec2, tt_cxy_pair(0, core), address, ""); -// device.l1_membar(0, "SMALL_READ_WRITE_TLB", {core}); -// 
device.read_from_device(readback_vec, tt_cxy_pair(0, core), address, 4*vec2.size(), ""); -// ASSERT_EQ(readback_vec, vec2); -// device.write_to_device(zeros, tt_cxy_pair(0, core), address, "") ; -// readback_vec = {}; -// } -// } -// }); - -// th1.join(); -// th2.join(); + } + }); + + std::thread th2 = std::thread([&] { + std::uint32_t address = base_addr + vec1.size() * 4; + for(int loop = 0; loop < 50; loop++) { + for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + std::vector readback_vec = {}; + device.write_to_device(vec2, tt_cxy_pair(0, core), address, ""); + device.l1_membar(0, "SMALL_READ_WRITE_TLB", {core}); + device.read_from_device(readback_vec, tt_cxy_pair(0, core), address, 4*vec2.size(), ""); + ASSERT_EQ(readback_vec, vec2); + device.write_to_device(zeros, tt_cxy_pair(0, core), address, "") ; + readback_vec = {}; + } + } + }); -// for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { -// device.read_from_device(readback_membar_vec, tt_cxy_pair(0, core), l1_mem::address_map::L1_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); -// ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for workers -// readback_membar_vec = {}; -// } + th1.join(); + th2.join(); -// for(auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { -// device.read_from_device(readback_membar_vec, tt_cxy_pair(0, core), eth_l1_mem::address_map::ERISC_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); -// ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for ethernet cores -// readback_membar_vec = {}; -// } -// device.close_device(); -// } + for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + device.read_from_device(readback_membar_vec, tt_cxy_pair(0, core), l1_mem::address_map::L1_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct state 
for workers + readback_membar_vec = {}; + } + for(auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { + device.read_from_device(readback_membar_vec, tt_cxy_pair(0, core), eth_l1_mem::address_map::ERISC_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct state for ethernet cores + readback_membar_vec = {}; + } + device.close_device(); +} -// TEST(SiliconDriverWH, BroadcastWrite) { -// // Broadcast multiple vectors to tensix and dram grid. Verify broadcasted data is read back correctly -// std::set target_devices = {0, 1}; +TEST(SiliconDriverBH, DISABLED_BroadcastWrite) { // Cannot broadcast to tensix/ethernet and DRAM simultaneously on Blackhole .. wait_for_non_mmio_flush() is not working as expected? + // Broadcast multiple vectors to tensix and dram grid. Verify broadcasted data is read back correctly + std::set target_devices = get_target_devices(); -// { -// std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(test_utils::GetClusterDescYAML()); -// if (cluster_desc_uniq->get_number_of_chips() != target_devices.size()) { -// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula system"; -// } -// } - -// std::unordered_map dynamic_tlb_config = {}; // Don't set any dynamic TLBs in this test -// uint32_t num_host_mem_ch_per_mmio_device = 1; + std::unordered_map dynamic_tlb_config = {}; // Don't set any dynamic TLBs in this test + uint32_t num_host_mem_ch_per_mmio_device = 1; -// tt_SiliconDevice device = tt_SiliconDevice("./tests/soc_descs/wormhole_b0_8x10.yaml", test_utils::GetClusterDescYAML(), target_devices, num_host_mem_ch_per_mmio_device, dynamic_tlb_config, false, true, true); -// set_params_for_remote_txn(device); -// auto mmio_devices = device.get_target_mmio_device_ids(); - -// tt_device_params default_params; -// device.start_device(default_params); -// 
device.deassert_risc_reset(); -// std::vector broadcast_sizes = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}; -// uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; -// std::set rows_to_exclude = {0, 6}; -// std::set cols_to_exclude = {0, 5}; -// std::set rows_to_exclude_for_dram_broadcast = {}; -// std::set cols_to_exclude_for_dram_broadcast = {1, 2, 3, 4, 6, 7, 8, 9}; - -// for(const auto& size : broadcast_sizes) { -// std::vector vector_to_write(size); -// std::vector zeros(size); -// std::vector readback_vec = {}; -// for(int i = 0; i < size; i++) { -// vector_to_write[i] = i; -// zeros[i] = 0; -// } -// // Broadcast to Tensix -// device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude, cols_to_exclude, "LARGE_WRITE_TLB"); -// // Broadcast to DRAM -// device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); -// device.wait_for_non_mmio_flush(); - -// for(const auto i : target_devices) { -// for(const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { -// if(rows_to_exclude.find(core.y) != rows_to_exclude.end()) continue; -// device.read_from_device(readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); -// ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was broadcasted"; -// device.write_to_device(zeros, tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data -// readback_vec = {}; -// } -// for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { -// const auto& core = device.get_virtual_soc_descriptors().at(i).get_core_for_dram_channel(chan, 0); -// device.read_from_device(readback_vec, tt_cxy_pair(i, core), address, 
vector_to_write.size() * 4, "LARGE_READ_TLB"); -// ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y << " does not match what was broadcasted " << size; -// device.write_to_device(zeros, tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data -// readback_vec = {}; -// } -// } -// // Wait for data to be cleared before writing next block -// device.wait_for_non_mmio_flush(); -// } -// device.close_device(); -// } + tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"), target_devices, num_host_mem_ch_per_mmio_device, dynamic_tlb_config, false, true, true); + set_params_for_remote_txn(device); + auto mmio_devices = device.get_target_mmio_device_ids(); -// TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { -// // Broadcast multiple vectors to tensix and dram grid. Verify broadcasted data is read back correctly -// std::set target_devices = {0, 1}; + tt_device_params default_params; + device.start_device(default_params); + device.deassert_risc_reset(); + std::vector broadcast_sizes = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}; + uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; + std::set rows_to_exclude = {0, 6}; + std::set cols_to_exclude = {0, 5}; + std::set rows_to_exclude_for_dram_broadcast = {}; + std::set cols_to_exclude_for_dram_broadcast = {1, 2, 3, 4, 6, 7, 8, 9}; + + for(const auto& size : broadcast_sizes) { + std::vector vector_to_write(size); + std::vector zeros(size); + std::vector readback_vec = {}; + for(int i = 0; i < size; i++) { + vector_to_write[i] = i; + zeros[i] = 0; + } + // Broadcast to Tensix + device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude, cols_to_exclude, "LARGE_WRITE_TLB"); + device.wait_for_non_mmio_flush(); // flush here so we 
don't simultaneously broadcast to DRAM? + // Broadcast to DRAM + device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); + device.wait_for_non_mmio_flush(); + + for(const auto i : target_devices) { + for(const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + if(rows_to_exclude.find(core.y) != rows_to_exclude.end()) continue; + device.read_from_device(readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was broadcasted"; + device.write_to_device(zeros, tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + readback_vec = {}; + } + for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { + const auto& core = device.get_virtual_soc_descriptors().at(i).get_core_for_dram_channel(chan, 0); + device.read_from_device(readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y << " does not match what was broadcasted " << size; + device.write_to_device(zeros, tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + readback_vec = {}; + } + } + // Wait for data to be cleared before writing next block + device.wait_for_non_mmio_flush(); + } + device.close_device(); +} -// { -// std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(test_utils::GetClusterDescYAML()); -// if (cluster_desc_uniq->get_number_of_chips() != target_devices.size()) { -// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula system"; -// } -// } +TEST(SiliconDriverBH, 
DISABLED_VirtualCoordinateBroadcast) { // same problem as above.. + // Broadcast multiple vectors to tensix and dram grid. Verify broadcasted data is read back correctly + std::set target_devices = get_target_devices(); -// std::unordered_map dynamic_tlb_config = {}; // Don't set any dynamic TLBs in this test -// uint32_t num_host_mem_ch_per_mmio_device = 1; + std::unordered_map dynamic_tlb_config = {}; // Don't set any dynamic TLBs in this test + uint32_t num_host_mem_ch_per_mmio_device = 1; -// tt_SiliconDevice device = tt_SiliconDevice("./tests/soc_descs/wormhole_b0_8x10.yaml", test_utils::GetClusterDescYAML(), target_devices, num_host_mem_ch_per_mmio_device, dynamic_tlb_config, false, true, true); -// set_params_for_remote_txn(device); -// auto mmio_devices = device.get_target_mmio_device_ids(); + tt_SiliconDevice device = tt_SiliconDevice(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"), target_devices, num_host_mem_ch_per_mmio_device, dynamic_tlb_config, false, true, true); + set_params_for_remote_txn(device); + auto mmio_devices = device.get_target_mmio_device_ids(); -// tt_device_params default_params; -// device.start_device(default_params); -// auto eth_version = device.get_ethernet_fw_version(); -// bool virtual_bcast_supported = (eth_version >= tt_version(6, 8, 0) || eth_version == tt_version(6, 7, 241)) && device.translation_tables_en; -// if (!virtual_bcast_supported) { -// device.close_device(); -// GTEST_SKIP() << "SiliconDriverWH.VirtualCoordinateBroadcast skipped since ethernet version does not support Virtual Coordinate Broadcast or NOC translation is not enabled"; -// } + tt_device_params default_params; + device.start_device(default_params); + auto eth_version = device.get_ethernet_fw_version(); + bool virtual_bcast_supported = (eth_version >= tt_version(6, 8, 0) || eth_version == tt_version(6, 7, 241)) && device.translation_tables_en; + if 
(!virtual_bcast_supported) { + device.close_device(); + GTEST_SKIP() << "SiliconDriverWH.VirtualCoordinateBroadcast skipped since ethernet version does not support Virtual Coordinate Broadcast or NOC translation is not enabled"; + } -// device.deassert_risc_reset(); -// std::vector broadcast_sizes = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}; -// uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; -// std::set rows_to_exclude = {0, 3, 5, 6, 8, 9}; -// std::set cols_to_exclude = {0, 5}; -// std::set rows_to_exclude_for_dram_broadcast = {}; -// std::set cols_to_exclude_for_dram_broadcast = {1, 2, 3, 4, 6, 7, 8, 9}; - -// for(const auto& size : broadcast_sizes) { -// std::vector vector_to_write(size); -// std::vector zeros(size); -// std::vector readback_vec = {}; -// for(int i = 0; i < size; i++) { -// vector_to_write[i] = i; -// zeros[i] = 0; -// } -// // Broadcast to Tensix -// device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude, cols_to_exclude, "LARGE_WRITE_TLB"); -// // Broadcast to DRAM -// device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); -// device.wait_for_non_mmio_flush(); - -// for(const auto i : target_devices) { -// for(const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { -// if(rows_to_exclude.find(core.y) != rows_to_exclude.end()) continue; -// device.read_from_device(readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); -// ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was broadcasted"; -// device.write_to_device(zeros, tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data -// readback_vec = {}; -// } -// for(int chan = 0; chan < 
device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { -// const auto& core = device.get_virtual_soc_descriptors().at(i).get_core_for_dram_channel(chan, 0); -// device.read_from_device(readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); -// ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y << " does not match what was broadcasted " << size; -// device.write_to_device(zeros, tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data -// readback_vec = {}; -// } -// } -// // Wait for data to be cleared before writing next block -// device.wait_for_non_mmio_flush(); -// } -// device.close_device(); -// } + device.deassert_risc_reset(); + std::vector broadcast_sizes = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}; + uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; + std::set rows_to_exclude = {0, 3, 5, 6, 8, 9}; + std::set cols_to_exclude = {0, 5}; + std::set rows_to_exclude_for_dram_broadcast = {}; + std::set cols_to_exclude_for_dram_broadcast = {1, 2, 3, 4, 6, 7, 8, 9}; + + for(const auto& size : broadcast_sizes) { + std::vector vector_to_write(size); + std::vector zeros(size); + std::vector readback_vec = {}; + for(int i = 0; i < size; i++) { + vector_to_write[i] = i; + zeros[i] = 0; + } + // Broadcast to Tensix + device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude, cols_to_exclude, "LARGE_WRITE_TLB"); + // Broadcast to DRAM + device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); + device.wait_for_non_mmio_flush(); + + for(const auto i : target_devices) { + for(const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + if(rows_to_exclude.find(core.y) != 
rows_to_exclude.end()) continue; + device.read_from_device(readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was broadcasted"; + device.write_to_device(zeros, tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + readback_vec = {}; + } + for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { + const auto& core = device.get_virtual_soc_descriptors().at(i).get_core_for_dram_channel(chan, 0); + device.read_from_device(readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y << " does not match what was broadcasted " << size; + device.write_to_device(zeros, tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + readback_vec = {}; + } + } + // Wait for data to be cleared before writing next block + device.wait_for_non_mmio_flush(); + } + device.close_device(); +}