Commit d79350e
#0: Enable metal on Galaxy.
#8305: add Galaxy cluster APIs
#8305: cleanup, add print
#8450: Establish tunnels originating from an mmio device. Determine the remote chips as well as their order on the tunnel.
#8452: add tests for tg pipeline
#0:    patch for tg workflows.
#8450: Add tables for tunnel dispatch workers with build settings.
       Populate build settings for tunnel kernels.
       Launch FD2 kernels based on information in tunnel device dispatch worker map.
       Enable 4 devices per hugepage/channel
#0:    disable hanging/failing tests for Galaxy
#0:    skip using channels 3 and 7, which use hugepage channel 3. This (4th) hugepage is not a full 1 GB in size; 256 MB of it is taken up by syseng tools.
#0:    re-enable Galaxy sharded tests, reduce one test runtime for Galaxy
#0:    fix cluster init for Galaxy
#8953: Fix hardcoding of queue sizes in tests.
#8450: Fix compute grid selection for N150. An N150 can be a standalone system or part of a TG system; on TG, the compute grid for N150 differs from standalone N150.
#0:    Reduce prefetch q entries to account for Galaxy CQ size.
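
Taken together, the sizing bullets imply simple arithmetic. A minimal sketch, assuming a 1 GB hugepage shared by 4 devices (the constants and helper name below are illustrative, not tt-metal's actual API):

// Sketch of Galaxy CQ sizing. Assumptions: each 1 GB host hugepage/channel
// is shared by 4 devices, and each device's slice is split across its HW CQs.
#include <cstdint>

constexpr uint32_t kHugepageSize      = 1u << 30;  // 1 GB host channel
constexpr uint32_t kDevicesPerChannel = 4;         // "4 devices per hugepage/channel"

// Per-device, per-CQ size: a quarter of the hugepage, divided by HW CQ count.
uint32_t galaxy_cq_size(uint32_t num_hw_cqs) {
    return (kHugepageSize / kDevicesPerChannel) / num_hw_cqs;
}

A quarter-sized channel slice is also why the prefetch queue entry count in the last bullet had to shrink.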
ubcheema committed Jun 3, 2024
1 parent 354370a commit d79350e
Showing 13 changed files with 1,419 additions and 698 deletions.
@@ -94,11 +94,11 @@ void test_EnqueueWriteBuffer_and_EnqueueReadBuffer(Device *device, CommandQueue
// Clear out command queue
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id());
- uint32_t cq_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel) / device->num_hw_cqs();
+ uint32_t cq_size = device->sysmem_manager().get_cq_size();

std::vector<uint32_t> cq_zeros((cq_size - CQ_START) / sizeof(uint32_t), 0);

- tt::Cluster::instance().write_sysmem(cq_zeros.data(), (cq_size - CQ_START), CQ_START, mmio_device_id, channel);
+ tt::Cluster::instance().write_sysmem(cq_zeros.data(), (cq_size - CQ_START), get_absolute_cq_offset(channel, 0, cq_size) + CQ_START, mmio_device_id, channel);

for (const bool cq_write : {true, false}) {
for (const bool cq_read : {true, false}) {
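
Because four devices now share one hugepage, a device's CQ no longer starts at byte 0 of its channel, hence the added get_absolute_cq_offset term in the write above. A hedged sketch of how such an offset might be composed (the slot layout and constants here are assumptions, not the real helper):

// Sketch: locate a device's CQ inside a shared hugepage channel.
// Assumption: devices occupy fixed 256 MB slots in channel order, and each
// slot holds that device's command queues back to back.
#include <cstdint>

uint32_t absolute_cq_offset_sketch(uint16_t channel, uint8_t cq_id, uint32_t cq_size) {
    constexpr uint32_t kDevicesPerChannel = 4;
    constexpr uint32_t kDevSlotSize = (1u << 30) / kDevicesPerChannel;  // 256 MB per device
    const uint32_t slot = channel % kDevicesPerChannel;  // device's slot within its hugepage
    return slot * kDevSlotSize + cq_id * cq_size;
}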
@@ -327,6 +327,7 @@ namespace dram_tests {
TEST_F(CommandQueueSingleCardFixture, WriteOneTileToDramBank0) {
TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM};
for (Device *device : devices_) {
+ tt::log_info("Running On Device {}", device->id());
local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config);
}
}
@@ -428,11 +429,9 @@ TEST_F(CommandQueueFixture, TestPageSizeTooLarge) {
// Requires enqueue write buffer
TEST_F(CommandQueueSingleCardFixture, TestWrapHostHugepageOnEnqueueReadBuffer) {
for (Device *device : this->devices_) {
+ tt::log_info("Running On Device {}", device->id());
uint32_t page_size = 2048;
- uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
- chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id());
- uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel);
- uint32_t command_issue_region_size = 805310400;
+ uint32_t command_issue_region_size = device->sysmem_manager().get_issue_queue_size(0);

uint32_t max_command_size = command_issue_region_size - CQ_START;
uint32_t buffer = 14240;
@@ -446,10 +445,9 @@ TEST_F(CommandQueueSingleCardFixture, TestWrapHostHugepageOnEnqueueReadBuffer) {

TEST_F(CommandQueueSingleCardFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) {
for (Device *device : this->devices_) {
+ tt::log_info("Running On Device {}", device->id());
uint32_t page_size = 2048;
- uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
- chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id());
- uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel);
+ uint32_t command_queue_size = device->sysmem_manager().get_cq_size();
uint32_t num_pages = command_queue_size / page_size;

TestBufferConfig config = {.num_pages = num_pages, .page_size = page_size, .buftype = BufferType::DRAM};
@@ -465,10 +463,8 @@ TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace) {
uint32_t small_page_size = 2048; // page size for second read

for (Device *device : devices_) {
- uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
- chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id());
- uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel);
- uint32_t command_completion_region_size = 268431360;
+ tt::log_info("Running On Device {}", device->id());
+ uint32_t command_completion_region_size = device->sysmem_manager().get_completion_queue_size(0);

uint32_t first_buffer_size = tt::round_up(command_completion_region_size * 0.95, large_page_size);

@@ -504,10 +500,8 @@ TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace) {
TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace2) {
// Using default 75-25 issue and completion queue split
for (Device *device : devices_) {
- uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
- chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id());
- uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel);
- uint32_t command_completion_region_size = 268431360;
+ tt::log_info("Running On Device {}", device->id());
+ uint32_t command_completion_region_size = device->sysmem_manager().get_completion_queue_size(0);

uint32_t num_pages_buff_1 = 9;
uint32_t page_size_buff_1 = 2048;
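
The deleted constants 805310400 and 268431360 are roughly 75% and 25% of a 1 GB channel, i.e. the default issue/completion split mentioned in the comment above; querying the sysmem manager keeps these tests correct when a Galaxy CQ is a quarter of that size. A sketch of the split, with the rounding treatment assumed:

// Sketch: default 75-25 issue/completion split of a command queue.
// The exact rounding/alignment tt-metal applies is an assumption here.
#include <cstdint>

struct CqRegions { uint32_t issue_size; uint32_t completion_size; };

CqRegions split_cq(uint32_t cq_size) {
    constexpr uint32_t kAlign = 32;  // assumed alignment granularity
    uint32_t issue = static_cast<uint32_t>(cq_size * 0.75) & ~(kAlign - 1);
    return {issue, cq_size - issue};  // completion gets the remainder (~25%)
}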
@@ -653,6 +647,7 @@ TEST_F(CommandQueueSingleCardFixture, WritesToRandomBufferTypeAndThenReadsBlocki
.seed = 0, .num_pages_total = 50000, .page_size = 2048, .max_num_pages_per_buffer = 16};

for (Device *device : devices_) {
+ tt::log_info("Running on Device {}", device->id());
EXPECT_TRUE(local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer<true>(
device, device->command_queue(), config));
}
@@ -672,34 +667,34 @@ TEST_F(CommandQueueSingleCardFixture, WritesToRandomBufferTypeAndThenReadsNonblo

// TODO: Split this into separate tests
TEST_F(CommandQueueSingleCardFixture, ShardedBufferL1ReadWrites) {
+ std::map<std::string, std::vector<std::array<uint32_t, 2>>> test_params;
+
for (Device *device : devices_) {
- for (const std::array<uint32_t, 2> cores :
-     {std::array<uint32_t, 2>{1, 1},
-     std::array<uint32_t, 2>{5, 1},
-     std::array<uint32_t, 2>{1, 5},
-     std::array<uint32_t, 2>{5, 3},
-     std::array<uint32_t, 2>{3, 5},
-     std::array<uint32_t, 2>{5, 5},
-     std::array<uint32_t, 2>{
-         static_cast<uint32_t>(device->compute_with_storage_grid_size().x),
-         static_cast<uint32_t>(device->compute_with_storage_grid_size().y)}}) {
-     for (const std::array<uint32_t, 2> num_pages : {
-         std::array<uint32_t, 2>{1, 1},
-         std::array<uint32_t, 2>{2, 1},
-         std::array<uint32_t, 2>{1, 2},
-         std::array<uint32_t, 2>{2, 2},
-         std::array<uint32_t, 2>{7, 11},
-         std::array<uint32_t, 2>{3, 65},
-         std::array<uint32_t, 2>{67, 4},
-         std::array<uint32_t, 2>{3, 137},
-     }) {
-         for (const std::array<uint32_t, 2> page_shape : {
-             std::array<uint32_t, 2>{32, 32},
-             std::array<uint32_t, 2>{1, 4},
-             std::array<uint32_t, 2>{1, 120},
-             std::array<uint32_t, 2>{1, 1024},
-             std::array<uint32_t, 2>{1, 2048},
-         }) {
+ if (tt::Cluster::instance().is_galaxy_cluster()) {
+     test_params = {
+         {"cores",
+          {{1, 1},
+           {static_cast<uint32_t>(device->compute_with_storage_grid_size().x),
+            static_cast<uint32_t>(device->compute_with_storage_grid_size().y)}}},
+         {"num_pages", {{3, 65}}},
+         {"page_shape", {{32, 32}}}};
+ } else {
+     test_params = {
+         {"cores",
+          {{1, 1},
+           {5, 1},
+           {1, 5},
+           {5, 3},
+           {3, 5},
+           {5, 5},
+           {static_cast<uint32_t>(device->compute_with_storage_grid_size().x),
+            static_cast<uint32_t>(device->compute_with_storage_grid_size().y)}}},
+         {"num_pages", {{1, 1}, {2, 1}, {1, 2}, {2, 2}, {7, 11}, {3, 65}, {67, 4}, {3, 137}}},
+         {"page_shape", {{32, 32}, {1, 4}, {1, 120}, {1, 1024}, {1, 2048}}}};
+ }
+ for (const std::array<uint32_t, 2> cores : test_params.at("cores")) {
+     for (const std::array<uint32_t, 2> num_pages : test_params.at("num_pages")) {
+         for (const std::array<uint32_t, 2> page_shape : test_params.at("page_shape")) {
for (const TensorMemoryLayout shard_strategy :
{TensorMemoryLayout::HEIGHT_SHARDED,
TensorMemoryLayout::WIDTH_SHARDED,
@@ -712,7 +707,7 @@ TEST_F(CommandQueueSingleCardFixture, ShardedBufferL1ReadWrites) {
config.num_iterations = num_iterations;
config.mem_config = shard_strategy;
config.page_shape = page_shape;
- tt::log_info(tt::LogTest, fmt::format("cores: [{},{}] num_pages: [{},{}] page_shape: [{},{}], shard_strategy: {}, num_iterations: {}", cores[0],cores[1], num_pages[0],num_pages[1], page_shape[0],page_shape[1], magic_enum::enum_name(shard_strategy).data(), num_iterations).c_str());
+ tt::log_info(tt::LogTest, fmt::format("Device: {} cores: [{},{}] num_pages: [{},{}] page_shape: [{},{}], shard_strategy: {}, num_iterations: {}", device->id(), cores[0],cores[1], num_pages[0],num_pages[1], page_shape[0],page_shape[1], magic_enum::enum_name(shard_strategy).data(), num_iterations).c_str());
local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_sharded(
device, device->command_queue(), config, BufferType::L1, false);
}
@@ -800,7 +795,7 @@ TEST_F(CommandQueueSingleCardFixture, ShardedBufferLargeL1ReadWrites) {
config.num_iterations = num_iterations;
config.mem_config = shard_strategy;
config.page_shape = page_shape;
- tt::log_info(tt::LogTest, fmt::format("cores: [{},{}] num_pages: [{},{}] page_shape: [{},{}], shard_strategy: {}, num_iterations: {}", cores[0],cores[1], num_pages[0],num_pages[1], page_shape[0],page_shape[1], magic_enum::enum_name(shard_strategy).data(), num_iterations).c_str());
+ tt::log_info(tt::LogTest, fmt::format("Device: {} cores: [{},{}] num_pages: [{},{}] page_shape: [{},{}], shard_strategy: {}, num_iterations: {}", device->id(), cores[0],cores[1], num_pages[0],num_pages[1], page_shape[0],page_shape[1], magic_enum::enum_name(shard_strategy).data(), num_iterations).c_str());
local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_sharded(
device, device->command_queue(), config, BufferType::L1, true);
}
10 changes: 9 additions & 1 deletion tt_metal/common/core_descriptor.hpp
@@ -119,6 +119,10 @@ inline const core_descriptor_t &get_core_descriptor_config(chip_id_t device_id,

auto compute_with_storage_start = desc_yaml["compute_with_storage_grid_range"]["start"];
auto compute_with_storage_end = desc_yaml["compute_with_storage_grid_range"]["end"];
+ if (tt::Cluster::instance().is_galaxy_cluster() and product_name == "nebula_x1") {
+     compute_with_storage_start = desc_yaml["tg_compute_with_storage_grid_range"]["start"];
+     compute_with_storage_end = desc_yaml["tg_compute_with_storage_grid_range"]["end"];
+ }
TT_ASSERT(compute_with_storage_start.IsSequence() and compute_with_storage_end.IsSequence());
TT_ASSERT(compute_with_storage_end[0].as<size_t>() >= compute_with_storage_start[0].as<size_t>());
TT_ASSERT(compute_with_storage_end[1].as<size_t>() >= compute_with_storage_start[1].as<size_t>());
@@ -136,7 +140,11 @@ }
}

std::vector<RelativeCoreCoord> dispatch_cores;
- for (const auto& core_node : desc_yaml["dispatch_cores"]) {
+ auto dispatch_cores_string = "dispatch_cores";
+ if (tt::Cluster::instance().is_galaxy_cluster() and product_name == "nebula_x1") {
+     dispatch_cores_string = "tg_dispatch_cores";
+ }
+ for (const auto& core_node : desc_yaml[dispatch_cores_string]) {
RelativeCoreCoord coord = {};
if (core_node.IsSequence()) {
// Logical coord
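
The nebula_x1 special case above pairs with the YAML below: a standalone N150 exposes an 8x8 compute-with-storage grid, while the same board inside a TG system is trimmed to 8x4 so the freed rows can host tunnel dispatch workers. A small check of that arithmetic:

// Grid sizes implied by the core descriptor ranges (inclusive [x, y] ends).
#include <cstddef>

constexpr size_t grid_cores(size_t x0, size_t y0, size_t x1, size_t y1) {
    return (x1 - x0 + 1) * (y1 - y0 + 1);
}
static_assert(grid_cores(0, 0, 7, 7) == 64, "standalone N150: 8x8");
static_assert(grid_cores(0, 0, 7, 3) == 32, "N150 in a TG system: 8x4");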
9 changes: 9 additions & 0 deletions tt_metal/core_descriptors/wormhole_b0_80_arch.yaml
@@ -49,12 +49,21 @@ nebula_x1:
start: [0, 0]
end: [7, 7]

+ tg_compute_with_storage_grid_range: # Logical only start and end [x, y]
+     start: [0, 0]
+     end: [7, 3]
+
storage_cores:
[]

dispatch_cores:
[[0, -1], [1, -1], [2, -1], [3, -1], [4, -1], [5, -1], [6, -1], [7, -1]]

+ tg_dispatch_cores:
+     [[0, -1], [1, -1], [2, -1], [3, -1], [4, -1], [5, -1], [6, -1], [7, -1],
+      [0, -2], [1, -2], [2, -2], [3, -2], [4, -2], [5, -2], [6, -2], [7, -2],
+      [0, -3], [1, -3], [2, -3], [3, -3], [4, -3], [5, -3], [6, -3], [7, -3]]
+
dispatch_core_type:
"tensix"

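The dispatch core entries use negative logical coordinates ([0, -1], [0, -2], ...) that index from the end of the grid, so the three tg_dispatch_cores rows land on the bottom three rows of the chip. A minimal sketch of that resolution (RelativeCoreCoord's actual implementation may differ):

// Sketch: resolve a relative core coord against a grid, Python-style
// negative indexing. resolve_relative({0, -1}, 8, 8) -> {0, 7}: column 0, last row.
#include <cstdint>

struct Coord { int32_t x; int32_t y; };

Coord resolve_relative(Coord rel, int32_t grid_x, int32_t grid_y) {
    return { rel.x >= 0 ? rel.x : grid_x + rel.x,
             rel.y >= 0 ? rel.y : grid_y + rel.y };
}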