diff --git a/tests/ttnn/unit_tests/gtests/test_graph_add.cpp b/tests/ttnn/unit_tests/gtests/test_graph_add.cpp
index 5903126db3a..9744b7a5f04 100644
--- a/tests/ttnn/unit_tests/gtests/test_graph_add.cpp
+++ b/tests/ttnn/unit_tests/gtests/test_graph_add.cpp
@@ -34,6 +34,8 @@ struct AddOpGraphTestParam {
     std::vector<std::string> expected_calltrace;
     uint32_t expected_peak_L1_memory_usage = 0;
     uint32_t expected_intermediate_tensors_count = 0;
+    uint32_t expected_l1_output_per_core = 0;
+    uint32_t expected_l1_peak_per_core = 0;
     std::vector<graph::TensorInfo> expected_output_info;
 };
 
@@ -70,6 +72,17 @@ TEST_P(AddOpGraphTestFixture, AddGraphTrace) {
         EXPECT_EQ(output_tensors_count, 1);
     }
 
+    // per-core buffer allocation size
+    {
+        auto compute_with_storage_grid_size = this->getDevice().compute_with_storage_grid_size();
+        size_t interleaved_storage_cores = compute_with_storage_grid_size.x * compute_with_storage_grid_size.y;
+
+        auto l1_output_per_core = graph::extract_l1_output_buffer_allocation_size_per_core(json_trace, interleaved_storage_cores);
+        EXPECT_EQ(l1_output_per_core, params.expected_l1_output_per_core);
+        auto l1_peak_per_core = graph::extract_l1_buffer_allocation_peak_size_per_core(json_trace, interleaved_storage_cores);
+        EXPECT_EQ(l1_peak_per_core, params.expected_l1_peak_per_core);
+    }
+
     // Query calls
     {
         auto peak_L1_memory_usage = graph::query_peak_L1_memory_usage(call);
@@ -115,6 +128,8 @@ INSTANTIATE_TEST_SUITE_P(
             .expected_calltrace = { "ttnn::add", "ttnn::prim::binary", "BinaryDeviceOperation", "tt::tt_metal::create_device_tensor" },
             .expected_peak_L1_memory_usage = 30720,
             .expected_intermediate_tensors_count = 0,
+            .expected_l1_output_per_core = 2048,
+            .expected_l1_peak_per_core = 2048,
             .expected_output_info = {
                 graph::TensorInfo{
                     .shape = ttnn::Shape(tt::tt_metal::Array4D{1, 3, 32, 32}),
@@ -129,6 +144,8 @@ INSTANTIATE_TEST_SUITE_P(
             .expected_calltrace = { "ttnn::add", "ttnn::repeat", "ttnn::prim::old_infra_device_operation", "RepeatDeviceOperation", "tt::tt_metal::create_device_tensor", "ttnn::prim::binary", "BinaryDeviceOperation", "tt::tt_metal::create_device_tensor"},
             .expected_peak_L1_memory_usage = 92160,
             .expected_intermediate_tensors_count = 0,
+            .expected_l1_output_per_core = 2048,
+            .expected_l1_peak_per_core = 2 * 2048,
             .expected_output_info = {
                 graph::TensorInfo{
                     .shape = ttnn::Shape(tt::tt_metal::Array4D{4, 3, 32, 32}),
diff --git a/ttnn/cpp/ttnn/graph/graph_consts.hpp b/ttnn/cpp/ttnn/graph/graph_consts.hpp
index 3e26a6aa325..ff8736d2f6a 100644
--- a/ttnn/cpp/ttnn/graph/graph_consts.hpp
+++ b/ttnn/cpp/ttnn/graph/graph_consts.hpp
@@ -19,6 +19,8 @@ namespace ttnn::graph {
     constexpr auto kSize = "size";
     constexpr auto kLayout= "layout";
     constexpr auto kShape = "shape";
+    constexpr auto kNumCores = "num_cores";
+    constexpr auto kPageSize = "page_size";
 
     // node names
     constexpr auto kNodeBuffer = "buffer";
diff --git a/ttnn/cpp/ttnn/graph/graph_processor.cpp b/ttnn/cpp/ttnn/graph/graph_processor.cpp
index fbbf33acc77..b75d438e1cf 100644
--- a/ttnn/cpp/ttnn/graph/graph_processor.cpp
+++ b/ttnn/cpp/ttnn/graph/graph_processor.cpp
@@ -102,7 +102,9 @@ void GraphProcessor::track_allocate(const tt::tt_metal::Buffer* buffer) {
         {kSize, std::to_string(buffer->size())},
         {kAddress, std::to_string(buffer->address())},
         {kType, buffer->is_dram() ? "DRAM" : "L1"},
"DRAM" : "L1"}, - {kLayout, tensorMemoryLayoutToString(buffer->buffer_layout())} + {kLayout, tensorMemoryLayoutToString(buffer->buffer_layout())}, + {kPageSize, std::to_string(buffer->page_size())}, + {kNumCores, std::to_string(buffer->num_cores().value_or(0))} // use 0 for interleaved }; { graph.push_back(Vertex{ @@ -122,7 +124,9 @@ void GraphProcessor::track_deallocate(tt::tt_metal::Buffer* buffer) { std::unordered_map params = { {kSize, std::to_string(buffer->size())}, {kType, buffer->is_dram() ? "DRAM" : "L1"}, - {kLayout, tensorMemoryLayoutToString(buffer->buffer_layout())} + {kLayout, tensorMemoryLayoutToString(buffer->buffer_layout())}, + {kPageSize, std::to_string(buffer->page_size())}, + {kNumCores, std::to_string(buffer->num_cores().value_or(0))} // use 0 for interleaved }; { graph.push_back(Vertex{ diff --git a/ttnn/cpp/ttnn/graph/graph_trace_utils.cpp b/ttnn/cpp/ttnn/graph/graph_trace_utils.cpp index 91d970861c2..089aceece15 100644 --- a/ttnn/cpp/ttnn/graph/graph_trace_utils.cpp +++ b/ttnn/cpp/ttnn/graph/graph_trace_utils.cpp @@ -214,4 +214,129 @@ std::vector extract_output_info(const nlohmann::json& trace) { } +namespace detail { + // This function computes the worst-case memory allocation per core for a given total size, page size, and number of cores. + size_t worst_case_per_core_allocation(size_t total_size, size_t page_size, size_t num_of_cores) { + size_t pages = std::ceil(float(total_size) / page_size); + size_t pages_per_core = std::ceil(float(pages) / num_of_cores); + return pages_per_core * page_size; + } +} + +// This function returns the worst-case memory allocation per core for the output L1 buffer. 0 for DRAM buffers. +uint32_t extract_l1_output_buffer_allocation_size_per_core(const nlohmann::json& trace, size_t interleaved_storage_cores) { + // we are lookin for buffer_allocate that is connected to a buffer, + // that is connected to a same tensor as the function_end connected to capture_end + // buffer_allocate -> buffer -> tensor <- function_end -> capture_end + + // Find the 'capture_end' node + const auto& capture_end_node = + std::find_if(trace.rbegin(), trace.rend(), [](const auto& v) { return v.at(kNodeType) == kNodeCaptureEnd; }); + + if (capture_end_node == trace.rend()) { + throw std::runtime_error("No capture_end node found in the trace"); + } + + // helper function to find a node of a specific type that points to a given node + auto find_a_first_node_of_a_type_pointing_to_me([&](const auto& trace, const char* node_type, const auto& my_node) { + return std::find_if(trace.begin(), trace.end(), [&](const auto& v) { + // check if v.at(kNodeType) starts with node_type because buffers and tensors have suffixes \"(counter)\" + std::string v_node_type = v.at(kNodeType).dump(); + v_node_type.erase(std::remove(v_node_type.begin(), v_node_type.end(), '"'), v_node_type.end()); + + return (v_node_type.starts_with(node_type)) && + (std::find(v.at(kConnections).begin(), v.at(kConnections).end(), my_node.at(kCounter)) != + v.at(kConnections).end()); + }); + }); + + // helper function to find a node by counter + auto find_node_by_counter([&](const auto& trace, int counter) { + return std::find_if(trace.begin(), trace.end(), [&](const auto& v) { return v.at(kCounter) == counter; }); + }); + + const auto& last_function_end_node = + find_a_first_node_of_a_type_pointing_to_me(trace, kNodeFunctionEnd, *capture_end_node); + + if (last_function_end_node == trace.end()) { + throw std::runtime_error("No function_end node connected to capture_end found in the trace"); + } + + 
+    const auto& output_tensor_node = find_node_by_counter(trace, last_function_end_node->at(kConnections).at(0).get<int>());
+    if (output_tensor_node == trace.end()) {
+        throw std::runtime_error("No tensor node connected to function_end found in the trace");
+    }
+
+    const auto& output_buffer_node =
+        find_a_first_node_of_a_type_pointing_to_me(trace, kNodeBuffer, *output_tensor_node);
+    if (output_buffer_node == trace.end()) {
+        throw std::runtime_error("No buffer node connected to tensor found in the trace");
+    }
+
+    const auto& output_buffer_allocate_node =
+        find_a_first_node_of_a_type_pointing_to_me(trace, kNodeBufferAllocate, *output_buffer_node);
+    if (output_buffer_allocate_node == trace.end()) {
+        throw std::runtime_error("No buffer_allocate node connected to buffer found in the trace");
+    }
+
+    // skip per-core accounting for DRAM buffers
+    if (output_buffer_allocate_node->at(kParams).at(kType) == "DRAM") {
+        return 0;
+    }
+
+    uint32_t output_buffer_allocate_total_size = std::stoi(output_buffer_allocate_node->at(kParams).at(kSize).get<std::string>());
+    uint32_t page_size = std::stoi(output_buffer_allocate_node->at(kParams).at(kPageSize).get<std::string>());
+    uint32_t num_of_cores = std::stoi(output_buffer_allocate_node->at(kParams).at(kNumCores).get<std::string>());
+    if (num_of_cores == 0) {
+        num_of_cores = interleaved_storage_cores;
+    }
+
+    return detail::worst_case_per_core_allocation(output_buffer_allocate_total_size, page_size, num_of_cores);
+}
+
+// Returns the worst-case per-core memory allocation at the peak of L1 usage; DRAM buffers are ignored.
+uint32_t extract_l1_buffer_allocation_peak_size_per_core(const nlohmann::json& trace, size_t interleaved_storage_cores) {
+    uint32_t current_size_per_core = 0;
+    uint32_t peak_size_per_core = 0;
+
+    for (const auto& node : trace) {
+        // process only buffer allocation and deallocation nodes
+        if (node.at(kNodeType) != kNodeBufferAllocate && node.at(kNodeType) != kNodeBufferDeallocate) {
+            continue;
+        }
+
+        // skip DRAM buffer allocations/deallocations
+        if (node.at(kParams).at(kType) == "DRAM") {
+            continue;
+        }
+
+        uint32_t page_size = std::stoi(node.at(kParams).at(kPageSize).get<std::string>());
+        uint32_t num_of_cores = std::stoi(node.at(kParams).at(kNumCores).get<std::string>());
+        if (num_of_cores == 0) {
+            num_of_cores = interleaved_storage_cores;
+        }
+
+        uint32_t size_per_core = detail::worst_case_per_core_allocation(
+            std::stoi(node.at(kParams).at(kSize).get<std::string>()), page_size, num_of_cores);
+
+        if (node.at(kNodeType) == kNodeBufferAllocate) {
+            current_size_per_core += size_per_core;
+            peak_size_per_core = std::max(peak_size_per_core, current_size_per_core);
+        } else { // kNodeBufferDeallocate
+            current_size_per_core -= size_per_core;
+        }
+    }
+
+    return peak_size_per_core;
+}
+
 } // namespace ttnn::graph
diff --git a/ttnn/cpp/ttnn/graph/graph_trace_utils.hpp b/ttnn/cpp/ttnn/graph/graph_trace_utils.hpp
index fa110d40cc2..b7eedc27133 100644
--- a/ttnn/cpp/ttnn/graph/graph_trace_utils.hpp
+++ b/ttnn/cpp/ttnn/graph/graph_trace_utils.hpp
@@ -12,6 +12,8 @@ namespace ttnn::graph {
 
 uint32_t extract_peak_L1_memory_usage(const nlohmann::json& trace);
 
+uint32_t extract_l1_output_buffer_allocation_size_per_core(const nlohmann::json& trace, size_t interleaved_storage_cores);
+uint32_t extract_l1_buffer_allocation_peak_size_per_core(const nlohmann::json& trace, size_t interleaved_storage_cores);
 // Returns count of intermediate and output tensors
 std::pair<uint32_t, uint32_t> count_intermediate_and_output_tensors(const nlohmann::json& trace);
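
Reviewer note: as a sanity check on the expectations added above, here is a standalone sketch of the per-core worst-case math. It assumes the test device reports an 8x8 compute-with-storage grid (64 interleaved storage cores) and a 2048 B page (one 32x32 bfloat16 tile); neither value is stated in the diff itself.

#include <cassert>
#include <cmath>
#include <cstddef>

// Mirror of detail::worst_case_per_core_allocation from graph_trace_utils.cpp.
size_t worst_case_per_core_allocation(size_t total_size, size_t page_size, size_t num_of_cores) {
    size_t pages = std::ceil(float(total_size) / page_size);        // whole pages needed
    size_t pages_per_core = std::ceil(float(pages) / num_of_cores); // pages on the busiest core
    return pages_per_core * page_size;
}

int main() {
    const size_t cores = 8 * 8; // assumed compute_with_storage_grid_size of 8x8
    const size_t page = 2048;   // one 32x32 bfloat16 tile

    // Case 1: peak total 30720 B -> 15 pages -> ceil(15/64) = 1 page per core -> 2048 B,
    // matching expected_l1_output_per_core and expected_l1_peak_per_core.
    assert(worst_case_per_core_allocation(30720, page, cores) == 2048);

    // Case 2: presumably the repeat output and the add output (each 4*3 tiles of
    // {4, 3, 32, 32} bfloat16, 24576 B) are alive at once, so each contributes one
    // page per core and the expected peak is 2 * 2048.
    assert(worst_case_per_core_allocation(24576, page, cores) == 2048);
    return 0;
}

The rounding to whole pages per core is why the reported per-core figure can exceed total_size / num_of_cores: every core that receives any pages is charged for full pages.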
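And a minimal usage sketch of the new peak extractor on a fabricated trace, showing the num_cores == 0 fallback to interleaved_storage_cores. The literal keys and node-type strings ("node_type", "params", "buffer_allocate", ...) are assumed to match the constants in graph_consts.hpp, and the include path is a guess; a real capture carries more fields (counter, connections), which this walk does not read.

#include <cstdint>
#include <iostream>
#include <nlohmann/json.hpp>

#include "ttnn/graph/graph_trace_utils.hpp" // path assumed

int main() {
    // Helper to build an interleaved L1 buffer node; num_cores == "0" marks
    // interleaved buffers, so the extractor substitutes interleaved_storage_cores.
    auto l1_node = [](const char* node_type, const char* size) {
        return nlohmann::json{
            {"node_type", node_type},
            {"params", {{"size", size}, {"type", "L1"}, {"page_size", "2048"}, {"num_cores", "0"}}}};
    };

    // allocate A, allocate B (peak: both alive), then free both
    nlohmann::json trace = nlohmann::json::array(
        {l1_node("buffer_allocate", "30720"),
         l1_node("buffer_allocate", "24576"),
         l1_node("buffer_deallocate", "30720"),
         l1_node("buffer_deallocate", "24576")});

    // With 64 interleaved storage cores each buffer rounds to one 2048 B page
    // per core, so the reported peak is 4096.
    std::cout << ttnn::graph::extract_l1_buffer_allocation_peak_size_per_core(trace, 64) << "\n";
    return 0;
}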