GraphProcessor extension to expose page size and num cores (#14903)
This change is part of a larger change to support the compiler.

### Problem description
For L1-allocated buffers, we want the graph processor to report per-core L1 usage.

### What's changed
Added GraphProcessor functionality for buffer_allocation/buffer_deallocation
nodes to capture page size and number of cores (for sharded buffer
allocations).

Extended graph utilities with
* extract_l1_output_buffer_allocation_size_per_core
* extract_l1_buffer_allocation_peak_size_per_core

Extended graph_test_add to cover this functionality. Also heavily tested
locally with different ops and options.
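For reference, a minimal sketch of the intended call pattern (mirroring the unit test changes below; `device`, `json_trace`, and the surrounding capture setup are assumed here, not part of this change):

```cpp
// Assumed: `device` is an open ttnn device and `json_trace` is a graph
// trace captured by the GraphProcessor for the op under inspection.
auto grid = device.compute_with_storage_grid_size();
size_t interleaved_storage_cores = grid.x * grid.y;

// Per-core L1 size of the output buffer (returns 0 for DRAM outputs).
uint32_t l1_output_per_core =
    ttnn::graph::extract_l1_output_buffer_allocation_size_per_core(json_trace, interleaved_storage_cores);

// Worst-case per-core peak across all L1 allocations in the trace.
uint32_t l1_peak_per_core =
    ttnn::graph::extract_l1_buffer_allocation_peak_size_per_core(json_trace, interleaved_storage_cores);
```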


### Checklist
- [ ] Post commit CI passes
- [ ] Blackhole Post commit (if applicable)
- [ ] Model regression CI testing passes (if applicable)
- [ ] Device performance regression CI testing passes (if applicable)
- [ ] New/Existing tests provide coverage for changes
mbezuljTT authored Nov 9, 2024
1 parent 03998d5 commit 37480d8
Showing 5 changed files with 152 additions and 2 deletions.
17 changes: 17 additions & 0 deletions tests/ttnn/unit_tests/gtests/test_graph_add.cpp
@@ -34,6 +34,8 @@ struct AddOpGraphTestParam {
std::vector<std::string> expected_calltrace;
uint32_t expected_peak_L1_memory_usage = 0;
uint32_t expected_intermediate_tensors_count = 0;
uint32_t expected_l1_output_per_core = 0;
uint32_t expected_l1_peak_per_core = 0;
std::vector<graph::TensorInfo> expected_output_info;
};

@@ -70,6 +72,17 @@ TEST_P(AddOpGraphTestFixture, AddGraphTrace) {
EXPECT_EQ(output_tensors_count, 1);
}

// per core buffer allocation size
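// for interleaved buffers the trace records num_cores = 0, so the extractors fall back to the full compute-with-storage grid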
{
auto compute_with_storage_grid_size = this->getDevice().compute_with_storage_grid_size();
size_t interleaved_storage_cores = compute_with_storage_grid_size.x * compute_with_storage_grid_size.y;

auto l1_output_per_core = graph::extract_l1_output_buffer_allocation_size_per_core(json_trace, interleaved_storage_cores);
EXPECT_EQ(l1_output_per_core, params.expected_l1_output_per_core);
auto l1_peak_per_core = graph::extract_l1_buffer_allocation_peak_size_per_core(json_trace, interleaved_storage_cores);
EXPECT_EQ(l1_peak_per_core, params.expected_l1_peak_per_core);
}

// Query calls
{
auto peak_L1_memory_usage = graph::query_peak_L1_memory_usage(call);
@@ -115,6 +128,8 @@ INSTANTIATE_TEST_SUITE_P(
.expected_calltrace = { "ttnn::add", "ttnn::prim::binary", "BinaryDeviceOperation", "tt::tt_metal::create_device_tensor" },
.expected_peak_L1_memory_usage = 30720,
.expected_intermediate_tensors_count = 0,
.expected_l1_output_per_core = 2048,
.expected_l1_peak_per_core = 2048,
.expected_output_info = {
graph::TensorInfo{
.shape = ttnn::Shape(tt::tt_metal::Array4D{1, 3, 32, 32}),
@@ -129,6 +144,8 @@ INSTANTIATE_TEST_SUITE_P(
.expected_calltrace = { "ttnn::add", "ttnn::repeat", "ttnn::prim::old_infra_device_operation", "RepeatDeviceOperation", "tt::tt_metal::create_device_tensor", "ttnn::prim::binary", "BinaryDeviceOperation", "tt::tt_metal::create_device_tensor"},
.expected_peak_L1_memory_usage = 92160,
.expected_intermediate_tensors_count = 0,
.expected_l1_output_per_core = 2048,
.expected_l1_peak_per_core = 2*2048,
.expected_output_info = {
graph::TensorInfo{
.shape = ttnn::Shape(tt::tt_metal::Array4D{4, 3, 32, 32}),
2 changes: 2 additions & 0 deletions ttnn/cpp/ttnn/graph/graph_consts.hpp
@@ -19,6 +19,8 @@ namespace ttnn::graph {
constexpr auto kSize = "size";
constexpr auto kLayout = "layout";
constexpr auto kShape = "shape";
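// per-buffer allocation params captured on buffer_allocate/buffer_deallocate nodes; num_cores is recorded as 0 for interleaved buffers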
constexpr auto kNumCores = "num_cores";
constexpr auto kPageSize = "page_size";

// node names
constexpr auto kNodeBuffer = "buffer";
8 changes: 6 additions & 2 deletions ttnn/cpp/ttnn/graph/graph_processor.cpp
@@ -102,7 +102,9 @@ void GraphProcessor::track_allocate(const tt::tt_metal::Buffer* buffer) {
{kSize, std::to_string(buffer->size())},
{kAddress, std::to_string(buffer->address())},
{kType, buffer->is_dram() ? "DRAM" : "L1"},
{kLayout, tensorMemoryLayoutToString(buffer->buffer_layout())}
{kLayout, tensorMemoryLayoutToString(buffer->buffer_layout())},
{kPageSize, std::to_string(buffer->page_size())},
{kNumCores, std::to_string(buffer->num_cores().value_or(0))} // use 0 for interleaved
};
{
graph.push_back(Vertex{
@@ -122,7 +124,9 @@ void GraphProcessor::track_deallocate(tt::tt_metal::Buffer* buffer) {
std::unordered_map<std::string, std::string> params = {
{kSize, std::to_string(buffer->size())},
{kType, buffer->is_dram() ? "DRAM" : "L1"},
{kLayout, tensorMemoryLayoutToString(buffer->buffer_layout())}
{kLayout, tensorMemoryLayoutToString(buffer->buffer_layout())},
{kPageSize, std::to_string(buffer->page_size())},
{kNumCores, std::to_string(buffer->num_cores().value_or(0))} // use 0 for interleaved
};
{
graph.push_back(Vertex{
125 changes: 125 additions & 0 deletions ttnn/cpp/ttnn/graph/graph_trace_utils.cpp
@@ -214,4 +214,129 @@ std::vector<TensorInfo> extract_output_info(const nlohmann::json& trace) {
}


namespace detail {
// This function computes the worst-case memory allocation per core for a given total size, page size, and number of cores.
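// e.g. total_size = 30720, page_size = 2048, num_of_cores = 8: 15 pages, ceil(15 / 8) = 2 pages per core, 2 * 2048 = 4096 bytes per core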
size_t worst_case_per_core_allocation(size_t total_size, size_t page_size, size_t num_of_cores) {
size_t pages = std::ceil(float(total_size) / page_size);
size_t pages_per_core = std::ceil(float(pages) / num_of_cores);
return pages_per_core * page_size;
}
} // namespace detail

// This function returns the worst-case memory allocation per core for the output L1 buffer. 0 for DRAM buffers.
uint32_t extract_l1_output_buffer_allocation_size_per_core(const nlohmann::json& trace, size_t interleaved_storage_cores) {
// we are looking for the buffer_allocate that is connected to a buffer,
// which is connected to the same tensor as the function_end that is connected to capture_end
// buffer_allocate -> buffer -> tensor <- function_end -> capture_end

// Find the 'capture_end' node
const auto& capture_end_node =
std::find_if(trace.rbegin(), trace.rend(), [](const auto& v) { return v.at(kNodeType) == kNodeCaptureEnd; });

if (capture_end_node == trace.rend()) {
throw std::runtime_error("No capture_end node found in the trace");
}

// helper function to find a node of a specific type that points to a given node
auto find_a_first_node_of_a_type_pointing_to_me([&](const auto& trace, const char* node_type, const auto& my_node) {
return std::find_if(trace.begin(), trace.end(), [&](const auto& v) {
// check if v.at(kNodeType) starts with node_type because buffers and tensors have "(counter)" suffixes
std::string v_node_type = v.at(kNodeType).dump();
v_node_type.erase(std::remove(v_node_type.begin(), v_node_type.end(), '"'), v_node_type.end());

return (v_node_type.starts_with(node_type)) &&
(std::find(v.at(kConnections).begin(), v.at(kConnections).end(), my_node.at(kCounter)) !=
v.at(kConnections).end());
});
});

// helper function to find a node by counter
auto find_node_by_counter([&](const auto& trace, int counter) {
return std::find_if(trace.begin(), trace.end(), [&](const auto& v) { return v.at(kCounter) == counter; });
});

const auto& last_function_end_node =
find_a_first_node_of_a_type_pointing_to_me(trace, kNodeFunctionEnd, *capture_end_node);

if (last_function_end_node == trace.end()) {
throw std::runtime_error("No function_end node connected to capture_end found in the trace");
}

const auto& output_tensor_node = find_node_by_counter(trace, last_function_end_node->at(kConnections).at(0).get<int>());
if (output_tensor_node == trace.end()) {
throw std::runtime_error("No tensor node connected to function_end found in the trace");
}

const auto& output_buffer_node =
find_a_first_node_of_a_type_pointing_to_me(trace, kNodeBuffer, *output_tensor_node);
if (output_buffer_node == trace.end()) {
throw std::runtime_error("No buffer node connected to tensor found in the trace");
}

const auto& output_buffer_allocate_node =
find_a_first_node_of_a_type_pointing_to_me(trace, kNodeBufferAllocate, *output_buffer_node);
if (output_buffer_allocate_node == trace.end()) {
throw std::runtime_error("No buffer_allocate node connected to buffer found in the trace");
}


uint32_t output_buffer_allocate_total_size = std::stoi(output_buffer_allocate_node->at(kParams).at(kSize).get<std::string>());

// skip dram buffer allocation checks
if (output_buffer_allocate_node->at(kParams).at(kType) == "DRAM")
{
return 0;
}

uint32_t page_size = std::stoi(output_buffer_allocate_node->at(kParams).at(kPageSize).get<std::string>());
uint32_t num_of_cores = std::stoi(output_buffer_allocate_node->at(kParams).at(kNumCores).get<std::string>());
if (num_of_cores == 0)
{
num_of_cores = interleaved_storage_cores;
}

return detail::worst_case_per_core_allocation(output_buffer_allocate_total_size, page_size, num_of_cores);
}

// This function returns the worst-case memory allocation per core for the peak L1 usage. Ignores DRAM buffers.
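// Walks the trace in order: each L1 buffer_allocate adds its worst-case per-core size, each buffer_deallocate subtracts it; the running maximum is the per-core peak.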
uint32_t extract_l1_buffer_allocation_peak_size_per_core(const nlohmann::json& trace, size_t interleaved_storage_cores) {

uint32_t current_size_per_core = 0;
uint32_t peak_size_per_core = 0;

for (const auto& node : trace)
{
// process only buffer allocation and deallocation nodes
if (node.at(kNodeType) != kNodeBufferAllocate && node.at(kNodeType) != kNodeBufferDeallocate)
{
continue;
}

// skip dram buffer allocation/deallocation
if (node.at(kParams).at(kType) == "DRAM")
{
continue;
}

uint32_t page_size = std::stoi(node.at(kParams).at(kPageSize).get<std::string>());
uint32_t num_of_cores = std::stoi(node.at(kParams).at(kNumCores).get<std::string>());
if (num_of_cores == 0)
{
num_of_cores = interleaved_storage_cores;
}

if (node.at(kNodeType) == kNodeBufferAllocate)
{
current_size_per_core += detail::worst_case_per_core_allocation(std::stoi(node.at(kParams).at(kSize).get<std::string>()), page_size, num_of_cores);
peak_size_per_core = std::max(peak_size_per_core, current_size_per_core);
}
else // kNodeBufferDeallocate
{
current_size_per_core -= detail::worst_case_per_core_allocation(std::stoi(node.at(kParams).at(kSize).get<std::string>()), page_size, num_of_cores);
}
}

return peak_size_per_core;
}

} // namespace ttnn::graph
2 changes: 2 additions & 0 deletions ttnn/cpp/ttnn/graph/graph_trace_utils.hpp
@@ -12,6 +12,8 @@
namespace ttnn::graph {

uint32_t extract_peak_L1_memory_usage(const nlohmann::json& trace);
uint32_t extract_l1_output_buffer_allocation_size_per_core(const nlohmann::json& trace, size_t interleaved_storage_cores);
uint32_t extract_l1_buffer_allocation_peak_size_per_core(const nlohmann::json& trace, size_t interleaved_storage_cores);

// Returns count of intermediate and output tensors
std::pair<uint32_t, uint32_t> count_intermediate_and_output_tensors(const nlohmann::json& trace);
