GraphProcessor extension to expose page size and num cores (#14903)
This change is part of a larger change to support the compiler.

### Problem description
For L1-allocated buffers, we want the graph processor to report per-core L1 usage.

### What's changed
Added GraphProcessor functionality for buffer_allocation/buffer_deallocation
nodes to capture page size and number of cores (for sharded buffer
allocations).

Extended graph utilities with
* extract_l1_output_buffer_allocation_size_per_core
* extract_l1_buffer_allocation_peak_size_per_core

Extended graph_test_add to cover this functionality. Also heavily tested
locally with different ops and options.
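For reference, a minimal sketch of the intended call pattern (mirroring the unit test changes below; `device`, `json_trace`, and the surrounding capture setup are assumed here, not part of this change):

```cpp
// Assumed: `device` is an open ttnn device and `json_trace` is a graph
// trace captured by the GraphProcessor for the op under inspection.
auto grid = device.compute_with_storage_grid_size();
size_t interleaved_storage_cores = grid.x * grid.y;

// Per-core L1 size of the output buffer (returns 0 for DRAM outputs).
uint32_t l1_output_per_core =
    ttnn::graph::extract_l1_output_buffer_allocation_size_per_core(json_trace, interleaved_storage_cores);

// Worst-case per-core peak across all L1 allocations in the trace.
uint32_t l1_peak_per_core =
    ttnn::graph::extract_l1_buffer_allocation_peak_size_per_core(json_trace, interleaved_storage_cores);
```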


### Checklist
- [ ] Post commit CI passes
- [ ] Blackhole Post commit (if applicable)
- [ ] Model regression CI testing passes (if applicable)
- [ ] Device performance regression CI testing passes (if applicable)
- [ ] New/Existing tests provide coverage for changes
mbezuljTT authored Nov 9, 2024
1 parent 03998d5 commit 37480d8
Showing 5 changed files with 152 additions and 2 deletions.
17 changes: 17 additions & 0 deletions tests/ttnn/unit_tests/gtests/test_graph_add.cpp
@@ -34,6 +34,8 @@ struct AddOpGraphTestParam {
std::vector<std::string> expected_calltrace;
uint32_t expected_peak_L1_memory_usage = 0;
uint32_t expected_intermediate_tensors_count = 0;
uint32_t expected_l1_output_per_core = 0;
uint32_t expected_l1_peak_per_core = 0;
std::vector<graph::TensorInfo> expected_output_info;
};

@@ -70,6 +72,17 @@ TEST_P(AddOpGraphTestFixture, AddGraphTrace) {
EXPECT_EQ(output_tensors_count, 1);
}

// per core buffer allocation size
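// for interleaved buffers the trace records num_cores = 0, so the extractors fall back to the full compute-with-storage grid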
{
auto compute_with_storage_grid_size = this->getDevice().compute_with_storage_grid_size();
size_t interleaved_storage_cores = compute_with_storage_grid_size.x * compute_with_storage_grid_size.y;

auto l1_output_per_core = graph::extract_l1_output_buffer_allocation_size_per_core(json_trace, interleaved_storage_cores);
EXPECT_EQ(l1_output_per_core, params.expected_l1_output_per_core);
auto l1_peak_per_core = graph::extract_l1_buffer_allocation_peak_size_per_core(json_trace, interleaved_storage_cores);
EXPECT_EQ(l1_peak_per_core, params.expected_l1_peak_per_core);
}

// Query calls
{
auto peak_L1_memory_usage = graph::query_peak_L1_memory_usage(call);
@@ -115,6 +128,8 @@ INSTANTIATE_TEST_SUITE_P(
.expected_calltrace = { "ttnn::add", "ttnn::prim::binary", "BinaryDeviceOperation", "tt::tt_metal::create_device_tensor" },
.expected_peak_L1_memory_usage = 30720,
.expected_intermediate_tensors_count = 0,
.expected_l1_output_per_core = 2048,
.expected_l1_peak_per_core = 2048,
.expected_output_info = {
graph::TensorInfo{
.shape = ttnn::Shape(tt::tt_metal::Array4D{1, 3, 32, 32}),
@@ -129,6 +144,8 @@ INSTANTIATE_TEST_SUITE_P(
.expected_calltrace = { "ttnn::add", "ttnn::repeat", "ttnn::prim::old_infra_device_operation", "RepeatDeviceOperation", "tt::tt_metal::create_device_tensor", "ttnn::prim::binary", "BinaryDeviceOperation", "tt::tt_metal::create_device_tensor"},
.expected_peak_L1_memory_usage = 92160,
.expected_intermediate_tensors_count = 0,
.expected_l1_output_per_core = 2048,
.expected_l1_peak_per_core = 2*2048,
.expected_output_info = {
graph::TensorInfo{
.shape = ttnn::Shape(tt::tt_metal::Array4D{4, 3, 32, 32}),
2 changes: 2 additions & 0 deletions ttnn/cpp/ttnn/graph/graph_consts.hpp
@@ -19,6 +19,8 @@ namespace ttnn::graph {
constexpr auto kSize = "size";
constexpr auto kLayout = "layout";
constexpr auto kShape = "shape";
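// per-buffer allocation params captured on buffer_allocate/buffer_deallocate nodes; num_cores is recorded as 0 for interleaved buffers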
constexpr auto kNumCores = "num_cores";
constexpr auto kPageSize = "page_size";

// node names
constexpr auto kNodeBuffer = "buffer";
8 changes: 6 additions & 2 deletions ttnn/cpp/ttnn/graph/graph_processor.cpp
@@ -102,7 +102,9 @@ void GraphProcessor::track_allocate(const tt::tt_metal::Buffer* buffer) {
{kSize, std::to_string(buffer->size())},
{kAddress, std::to_string(buffer->address())},
{kType, buffer->is_dram() ? "DRAM" : "L1"},
{kLayout, tensorMemoryLayoutToString(buffer->buffer_layout())}
{kLayout, tensorMemoryLayoutToString(buffer->buffer_layout())},
{kPageSize, std::to_string(buffer->page_size())},
{kNumCores, std::to_string(buffer->num_cores().value_or(0))} // use 0 for interleaved
};
{
graph.push_back(Vertex{
@@ -122,7 +124,9 @@ void GraphProcessor::track_deallocate(tt::tt_metal::Buffer* buffer) {
std::unordered_map<std::string, std::string> params = {
{kSize, std::to_string(buffer->size())},
{kType, buffer->is_dram() ? "DRAM" : "L1"},
{kLayout, tensorMemoryLayoutToString(buffer->buffer_layout())}
{kLayout, tensorMemoryLayoutToString(buffer->buffer_layout())},
{kPageSize, std::to_string(buffer->page_size())},
{kNumCores, std::to_string(buffer->num_cores().value_or(0))} // use 0 for interleaved
};
{
graph.push_back(Vertex{
125 changes: 125 additions & 0 deletions ttnn/cpp/ttnn/graph/graph_trace_utils.cpp
@@ -214,4 +214,129 @@ std::vector<TensorInfo> extract_output_info(const nlohmann::json& trace) {
}


namespace detail {
// This function computes the worst-case memory allocation per core for a given total size, page size, and number of cores.
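// e.g. total_size = 30720, page_size = 2048, num_of_cores = 8: 15 pages, ceil(15 / 8) = 2 pages per core, 2 * 2048 = 4096 bytes per core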
size_t worst_case_per_core_allocation(size_t total_size, size_t page_size, size_t num_of_cores) {
size_t pages = std::ceil(float(total_size) / page_size);
size_t pages_per_core = std::ceil(float(pages) / num_of_cores);
return pages_per_core * page_size;
}
} // namespace detail

// This function returns the worst-case memory allocation per core for the output L1 buffer. 0 for DRAM buffers.
uint32_t extract_l1_output_buffer_allocation_size_per_core(const nlohmann::json& trace, size_t interleaved_storage_cores) {
// we are looking for the buffer_allocate that is connected to a buffer,
// which is connected to the same tensor as the function_end that is connected to capture_end
// buffer_allocate -> buffer -> tensor <- function_end -> capture_end

// Find the 'capture_end' node
const auto& capture_end_node =
std::find_if(trace.rbegin(), trace.rend(), [](const auto& v) { return v.at(kNodeType) == kNodeCaptureEnd; });

if (capture_end_node == trace.rend()) {
throw std::runtime_error("No capture_end node found in the trace");
}

// helper function to find a node of a specific type that points to a given node
auto find_a_first_node_of_a_type_pointing_to_me([&](const auto& trace, const char* node_type, const auto& my_node) {
return std::find_if(trace.begin(), trace.end(), [&](const auto& v) {
// check if v.at(kNodeType) starts with node_type because buffers and tensors have "(counter)" suffixes
std::string v_node_type = v.at(kNodeType).dump();
v_node_type.erase(std::remove(v_node_type.begin(), v_node_type.end(), '"'), v_node_type.end());

return (v_node_type.starts_with(node_type)) &&
(std::find(v.at(kConnections).begin(), v.at(kConnections).end(), my_node.at(kCounter)) !=
v.at(kConnections).end());
});
});

// helper function to find a node by counter
auto find_node_by_counter([&](const auto& trace, int counter) {
return std::find_if(trace.begin(), trace.end(), [&](const auto& v) { return v.at(kCounter) == counter; });
});

const auto& last_function_end_node =
find_a_first_node_of_a_type_pointing_to_me(trace, kNodeFunctionEnd, *capture_end_node);

if (last_function_end_node == trace.end()) {
throw std::runtime_error("No function_end node connected to capture_end found in the trace");
}

const auto& output_tensor_node = find_node_by_counter(trace, last_function_end_node->at(kConnections).at(0).get<int>());
if (output_tensor_node == trace.end()) {
throw std::runtime_error("No tensor node connected to function_end found in the trace");
}

const auto& output_buffer_node =
find_a_first_node_of_a_type_pointing_to_me(trace, kNodeBuffer, *output_tensor_node);
if (output_buffer_node == trace.end()) {
throw std::runtime_error("No buffer node connected to tensor found in the trace");
}

const auto& output_buffer_allocate_node =
find_a_first_node_of_a_type_pointing_to_me(trace, kNodeBufferAllocate, *output_buffer_node);
if (output_buffer_allocate_node == trace.end()) {
throw std::runtime_error("No buffer_allocate node connected to buffer found in the trace");
}


uint32_t output_buffer_allocate_total_size = std::stoi(output_buffer_allocate_node->at(kParams).at(kSize).get<std::string>());

// skip dram buffer allocation checks
if (output_buffer_allocate_node->at(kParams).at(kType) == "DRAM")
{
return 0;
}

uint32_t page_size = std::stoi(output_buffer_allocate_node->at(kParams).at(kPageSize).get<std::string>());
uint32_t num_of_cores = std::stoi(output_buffer_allocate_node->at(kParams).at(kNumCores).get<std::string>());
if (num_of_cores == 0)
{
num_of_cores = interleaved_storage_cores;
}

return detail::worst_case_per_core_allocation(output_buffer_allocate_total_size, page_size, num_of_cores);
}

// This function returns the worst-case memory allocation per core for the peak L1 usage. Ignores DRAM buffers.
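// Walks the trace in order: each L1 buffer_allocate adds its worst-case per-core size, each buffer_deallocate subtracts it; the running maximum is the per-core peak.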
uint32_t extract_l1_buffer_allocation_peak_size_per_core(const nlohmann::json& trace, size_t interleaved_storage_cores) {

uint32_t current_size_per_core = 0;
uint32_t peak_size_per_core = 0;

for (const auto& node : trace)
{
// process only buffer allocation and deallocation nodes
if (node.at(kNodeType) != kNodeBufferAllocate && node.at(kNodeType) != kNodeBufferDeallocate)
{
continue;
}

// skip dram buffer allocation/deallocation
if (node.at(kParams).at(kType) == "DRAM")
{
continue;
}

uint32_t page_size = std::stoi(node.at(kParams).at(kPageSize).get<std::string>());
uint32_t num_of_cores = std::stoi(node.at(kParams).at(kNumCores).get<std::string>());
if (num_of_cores == 0)
{
num_of_cores = interleaved_storage_cores;
}

if (node.at(kNodeType) == kNodeBufferAllocate)
{
current_size_per_core += detail::worst_case_per_core_allocation(std::stoi(node.at(kParams).at(kSize).get<std::string>()), page_size, num_of_cores);
peak_size_per_core = std::max(peak_size_per_core, current_size_per_core);
}
else // kNodeBufferDeallocate
{
current_size_per_core -= detail::worst_case_per_core_allocation(std::stoi(node.at(kParams).at(kSize).get<std::string>()), page_size, num_of_cores);
}
}

return peak_size_per_core;
}

} // namespace ttnn::graph
2 changes: 2 additions & 0 deletions ttnn/cpp/ttnn/graph/graph_trace_utils.hpp
@@ -12,6 +12,8 @@
namespace ttnn::graph {

uint32_t extract_peak_L1_memory_usage(const nlohmann::json& trace);
uint32_t extract_l1_output_buffer_allocation_size_per_core(const nlohmann::json& trace, size_t interleaved_storage_cores);
uint32_t extract_l1_buffer_allocation_peak_size_per_core(const nlohmann::json& trace, size_t interleaved_storage_cores);

// Returns count of intermediate and output tensors
std::pair<uint32_t, uint32_t> count_intermediate_and_output_tensors(const nlohmann::json& trace);
