Skip to content

Commit

Permalink
Move a sub_device test
Browse files Browse the repository at this point in the history
  • Loading branch information
sagarwalTT committed Nov 14, 2024
1 parent 5907587 commit b81a355
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 91 deletions.
2 changes: 1 addition & 1 deletion tests/tt_metal/tt_metal/api/test_worker_config_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,4 @@ TEST(WorkerConfigBuffer, SmallSize) {
}
}

} // namespace working_config_buffer_tests
} // namespace worker_config_buffer_tests
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Source files that make up the dispatch-buffer unit tests. The list is stored
# as an INTERNAL cache entry (with an empty docstring); presumably it is
# consumed by a parent CMakeLists that builds the test binary -- confirm there.
set(UNIT_TESTS_DISPATCH_BUFFER_SRC
    ${CMAKE_CURRENT_SOURCE_DIR}/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/test_sub_device.cpp
    CACHE INTERNAL ""
)
Expand Down
110 changes: 110 additions & 0 deletions tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_sub_device.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <cstddef>
#include <cstdint>
#include <array>
#include <tuple>
#include <vector>

#include "gtest/gtest.h"
#include "tt_metal/common/core_coord.hpp"
#include "tt_metal/impl/buffers/global_semaphore.hpp"
#include "tt_metal/impl/device/device.hpp"
#include "tt_metal/impl/event/event.hpp"
#include "tt_metal/impl/sub_device/sub_device.hpp"
#include "test_utils/stimulus.hpp"
#include "command_queue_fixture.hpp"
#include "command_queue_test_utils.hpp"
#include "sub_device_test_utils.hpp"

// Exercises L1 buffer allocation through sub-device managers on every device
// owned by the fixture.
//
// For each device the test:
//   1. loads a manager with one sub-device, allocates a height-sharded buffer
//      on sub-device 0, and checks its address plus a write/read round trip;
//   2. checks that several operations throw while that buffer is still
//      allocated, and that clearing/loading succeeds after it is deallocated;
//   3. loads a manager with two sub-devices and repeats the allocation and
//      round-trip checks on sub-device 1, then allocates again on sub-device 0.
TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) {
    // Size handed to create_sub_device_manager() as the local L1 region size.
    uint32_t local_l1_size = 3200;
    SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))});
    SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})});
    CoreRangeSet sharded_cores_1 = CoreRange({0, 0}, {2, 2});
    CoreRangeSet sharded_cores_2 = CoreRange({4, 4}, {4, 4});

    auto sharded_cores_1_vec = corerange_to_cores(sharded_cores_1, std::nullopt, true);
    auto sharded_cores_2_vec = corerange_to_cores(sharded_cores_2, std::nullopt, true);

    // Height-sharded L1 buffer: one 32-byte page per core of sharded_cores_1.
    ShardSpecBuffer shard_spec_buffer_1 = ShardSpecBuffer(sharded_cores_1, {1, 1}, ShardOrientation::ROW_MAJOR, false, {1, 1}, {sharded_cores_1.num_cores(), 1});
    uint32_t page_size_1 = 32;
    ShardedBufferConfig shard_config_1 = {nullptr, sharded_cores_1.num_cores() * page_size_1, page_size_1, BufferType::L1, TensorMemoryLayout::HEIGHT_SHARDED, shard_spec_buffer_1};
    auto input_1 = tt::test_utils::generate_uniform_random_vector<uint32_t>(0, 100, shard_config_1.size / sizeof(uint32_t));

    // Height-sharded L1 buffer: one 64-byte page per core of sharded_cores_2.
    ShardSpecBuffer shard_spec_buffer_2 = ShardSpecBuffer(sharded_cores_2, {1, 1}, ShardOrientation::ROW_MAJOR, false, {1, 1}, {sharded_cores_2.num_cores(), 1});
    uint32_t page_size_2 = 64;
    ShardedBufferConfig shard_config_2 = {nullptr, sharded_cores_2.num_cores() * page_size_2, page_size_2, BufferType::L1, TensorMemoryLayout::HEIGHT_SHARDED, shard_spec_buffer_2};
    auto input_2 = tt::test_utils::generate_uniform_random_vector<uint32_t>(0, 100, shard_config_2.size / sizeof(uint32_t));

    // Single-page interleaved L1 buffer; it is only allocated, never written.
    uint32_t page_size_3 = 1024;
    InterleavedBufferConfig interleaved_config = {nullptr, page_size_3, page_size_3, BufferType::L1, TensorMemoryLayout::INTERLEAVED};

    // Number of 32-bit words held by one page of each sharded buffer.
    const std::size_t words_per_page_1 = page_size_1 / sizeof(uint32_t);
    const std::size_t words_per_page_2 = page_size_2 / sizeof(uint32_t);

    for (Device *device : devices_) {
        auto sub_device_manager_1 = device->create_sub_device_manager({sub_device_1}, local_l1_size);
        auto sub_device_manager_2 = device->create_sub_device_manager({sub_device_1, sub_device_2}, local_l1_size);
        DeviceAddr l1_unreserved_base = device->get_base_allocator_addr(HalMemType::L1);
        // The address checks below expect each sub-device buffer to end here.
        DeviceAddr max_addr = l1_unreserved_base + local_l1_size;

        shard_config_1.device = device;
        shard_config_2.device = device;
        interleaved_config.device = device;

        // Translate logical cores to the coordinates used for the direct reads.
        std::vector<CoreCoord> physical_cores_1;
        physical_cores_1.reserve(sharded_cores_1_vec.size());
        for (const auto& core : sharded_cores_1_vec) {
            physical_cores_1.push_back(device->worker_core_from_logical_core(core));
        }

        std::vector<CoreCoord> physical_cores_2;
        physical_cores_2.reserve(sharded_cores_2_vec.size());
        for (const auto& core : sharded_cores_2_vec) {
            physical_cores_2.push_back(device->worker_core_from_logical_core(core));
        }

        // Phase 1: manager with a single sub-device.
        device->load_sub_device_manager(sub_device_manager_1);

        auto buffer_1 = CreateBuffer(shard_config_1, SubDeviceId{0});
        EXPECT_EQ(buffer_1->address(), max_addr - page_size_1);
        EnqueueWriteBuffer(device->command_queue(), buffer_1, input_1, false);
        std::vector<uint32_t> output_1;
        EnqueueReadBuffer(device->command_queue(), buffer_1, output_1, true);
        EXPECT_EQ(input_1, output_1);
        // Cross-check with direct per-core reads: each core should hold the next
        // page-sized slice of the input at the buffer address.
        auto input_1_it = input_1.begin();
        for (const auto& physical_core : physical_cores_1) {
            auto readback = tt::llrt::read_hex_vec_from_core(
                device->id(), physical_core, buffer_1->address(), page_size_1);
            // The four-iterator overload also compares lengths, so a short
            // readback fails the check instead of being read past its end.
            EXPECT_TRUE(std::equal(input_1_it, input_1_it + words_per_page_1, readback.begin(), readback.end()));
            input_1_it += words_per_page_1;
        }

        // Phase 2: operations expected to throw while buffer_1 is allocated.
        // buffer_2 is created without a sub-device id and is kept alive until
        // the end of this iteration.
        auto buffer_2 = CreateBuffer(interleaved_config);
        // NOTE(review): presumably throws because manager 1 has no sub-device 1 -- confirm.
        EXPECT_THROW(CreateBuffer(shard_config_1, SubDeviceId{1}), std::exception);
        EXPECT_THROW(device->clear_loaded_sub_device_manager(), std::exception);
        EXPECT_THROW(device->load_sub_device_manager(sub_device_manager_2), std::exception);
        // After buffer_1 is deallocated the same two calls must not throw.
        DeallocateBuffer(*buffer_1);
        device->clear_loaded_sub_device_manager();
        device->load_sub_device_manager(sub_device_manager_2);

        // Phase 3: manager with two sub-devices; allocate on sub-device 1.
        auto buffer_3 = CreateBuffer(shard_config_2, SubDeviceId{1});
        EXPECT_EQ(buffer_3->address(), max_addr - page_size_2);
        EnqueueWriteBuffer(device->command_queue(), buffer_3, input_2, false);
        std::vector<uint32_t> output_2;
        EnqueueReadBuffer(device->command_queue(), buffer_3, output_2, true);
        EXPECT_EQ(input_2, output_2);
        auto input_2_it = input_2.begin();
        for (const auto& physical_core : physical_cores_2) {
            auto readback = tt::llrt::read_hex_vec_from_core(
                device->id(), physical_core, buffer_3->address(), page_size_2);
            EXPECT_TRUE(std::equal(input_2_it, input_2_it + words_per_page_2, readback.begin(), readback.end()));
            input_2_it += words_per_page_2;
        }

        // Sub-device 0 is expected to hand out the same top-of-region address
        // while buffer_3 is still allocated on sub-device 1.
        auto buffer_4 = CreateBuffer(shard_config_1, SubDeviceId{0});
        EXPECT_EQ(buffer_4->address(), max_addr - page_size_1);
        // An interleaved config combined with a sub-device id is expected to throw.
        EXPECT_THROW(CreateBuffer(interleaved_config, SubDeviceId{0}), std::exception);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,96 +19,6 @@
#include "command_queue_test_utils.hpp"
#include "sub_device_test_utils.hpp"

// Exercises L1 buffer allocation through sub-device managers on every device
// owned by the fixture.
//
// For each device the test:
//   1. loads a manager with one sub-device, allocates a height-sharded buffer
//      on sub-device 0, and checks its address plus a write/read round trip;
//   2. checks that several operations throw while that buffer is still
//      allocated, and that clearing/loading succeeds after it is deallocated;
//   3. loads a manager with two sub-devices and repeats the allocation and
//      round-trip checks on sub-device 1, then allocates again on sub-device 0.
TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) {
    // Size handed to create_sub_device_manager() as the local L1 region size.
    uint32_t local_l1_size = 3200;
    SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))});
    SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})});
    CoreRangeSet sharded_cores_1 = CoreRange({0, 0}, {2, 2});
    CoreRangeSet sharded_cores_2 = CoreRange({4, 4}, {4, 4});

    auto sharded_cores_1_vec = corerange_to_cores(sharded_cores_1, std::nullopt, true);
    auto sharded_cores_2_vec = corerange_to_cores(sharded_cores_2, std::nullopt, true);

    // Height-sharded L1 buffer: one 32-byte page per core of sharded_cores_1.
    ShardSpecBuffer shard_spec_buffer_1 = ShardSpecBuffer(sharded_cores_1, {1, 1}, ShardOrientation::ROW_MAJOR, false, {1, 1}, {sharded_cores_1.num_cores(), 1});
    uint32_t page_size_1 = 32;
    ShardedBufferConfig shard_config_1 = {nullptr, sharded_cores_1.num_cores() * page_size_1, page_size_1, BufferType::L1, TensorMemoryLayout::HEIGHT_SHARDED, shard_spec_buffer_1};
    auto input_1 = tt::test_utils::generate_uniform_random_vector<uint32_t>(0, 100, shard_config_1.size / sizeof(uint32_t));

    // Height-sharded L1 buffer: one 64-byte page per core of sharded_cores_2.
    ShardSpecBuffer shard_spec_buffer_2 = ShardSpecBuffer(sharded_cores_2, {1, 1}, ShardOrientation::ROW_MAJOR, false, {1, 1}, {sharded_cores_2.num_cores(), 1});
    uint32_t page_size_2 = 64;
    ShardedBufferConfig shard_config_2 = {nullptr, sharded_cores_2.num_cores() * page_size_2, page_size_2, BufferType::L1, TensorMemoryLayout::HEIGHT_SHARDED, shard_spec_buffer_2};
    auto input_2 = tt::test_utils::generate_uniform_random_vector<uint32_t>(0, 100, shard_config_2.size / sizeof(uint32_t));

    // Single-page interleaved L1 buffer; it is only allocated, never written.
    uint32_t page_size_3 = 1024;
    InterleavedBufferConfig interleaved_config = {nullptr, page_size_3, page_size_3, BufferType::L1, TensorMemoryLayout::INTERLEAVED};

    // Number of 32-bit words held by one page of each sharded buffer.
    const std::size_t words_per_page_1 = page_size_1 / sizeof(uint32_t);
    const std::size_t words_per_page_2 = page_size_2 / sizeof(uint32_t);

    for (Device *device : devices_) {
        auto sub_device_manager_1 = device->create_sub_device_manager({sub_device_1}, local_l1_size);
        auto sub_device_manager_2 = device->create_sub_device_manager({sub_device_1, sub_device_2}, local_l1_size);
        DeviceAddr l1_unreserved_base = device->get_base_allocator_addr(HalMemType::L1);
        // The address checks below expect each sub-device buffer to end here.
        DeviceAddr max_addr = l1_unreserved_base + local_l1_size;

        shard_config_1.device = device;
        shard_config_2.device = device;
        interleaved_config.device = device;

        // Translate logical cores to the coordinates used for the direct reads.
        std::vector<CoreCoord> physical_cores_1;
        physical_cores_1.reserve(sharded_cores_1_vec.size());
        for (const auto& core : sharded_cores_1_vec) {
            physical_cores_1.push_back(device->worker_core_from_logical_core(core));
        }

        std::vector<CoreCoord> physical_cores_2;
        physical_cores_2.reserve(sharded_cores_2_vec.size());
        for (const auto& core : sharded_cores_2_vec) {
            physical_cores_2.push_back(device->worker_core_from_logical_core(core));
        }

        // Phase 1: manager with a single sub-device.
        device->load_sub_device_manager(sub_device_manager_1);

        auto buffer_1 = CreateBuffer(shard_config_1, SubDeviceId{0});
        EXPECT_EQ(buffer_1->address(), max_addr - page_size_1);
        EnqueueWriteBuffer(device->command_queue(), buffer_1, input_1, false);
        std::vector<uint32_t> output_1;
        EnqueueReadBuffer(device->command_queue(), buffer_1, output_1, true);
        EXPECT_EQ(input_1, output_1);
        // Cross-check with direct per-core reads: each core should hold the next
        // page-sized slice of the input at the buffer address.
        auto input_1_it = input_1.begin();
        for (const auto& physical_core : physical_cores_1) {
            auto readback = tt::llrt::read_hex_vec_from_core(
                device->id(), physical_core, buffer_1->address(), page_size_1);
            // The four-iterator overload also compares lengths, so a short
            // readback fails the check instead of being read past its end.
            EXPECT_TRUE(std::equal(input_1_it, input_1_it + words_per_page_1, readback.begin(), readback.end()));
            input_1_it += words_per_page_1;
        }

        // Phase 2: operations expected to throw while buffer_1 is allocated.
        // buffer_2 is created without a sub-device id and is kept alive until
        // the end of this iteration.
        auto buffer_2 = CreateBuffer(interleaved_config);
        // NOTE(review): presumably throws because manager 1 has no sub-device 1 -- confirm.
        EXPECT_THROW(CreateBuffer(shard_config_1, SubDeviceId{1}), std::exception);
        EXPECT_THROW(device->clear_loaded_sub_device_manager(), std::exception);
        EXPECT_THROW(device->load_sub_device_manager(sub_device_manager_2), std::exception);
        // After buffer_1 is deallocated the same two calls must not throw.
        DeallocateBuffer(*buffer_1);
        device->clear_loaded_sub_device_manager();
        device->load_sub_device_manager(sub_device_manager_2);

        // Phase 3: manager with two sub-devices; allocate on sub-device 1.
        auto buffer_3 = CreateBuffer(shard_config_2, SubDeviceId{1});
        EXPECT_EQ(buffer_3->address(), max_addr - page_size_2);
        EnqueueWriteBuffer(device->command_queue(), buffer_3, input_2, false);
        std::vector<uint32_t> output_2;
        EnqueueReadBuffer(device->command_queue(), buffer_3, output_2, true);
        EXPECT_EQ(input_2, output_2);
        auto input_2_it = input_2.begin();
        for (const auto& physical_core : physical_cores_2) {
            auto readback = tt::llrt::read_hex_vec_from_core(
                device->id(), physical_core, buffer_3->address(), page_size_2);
            EXPECT_TRUE(std::equal(input_2_it, input_2_it + words_per_page_2, readback.begin(), readback.end()));
            input_2_it += words_per_page_2;
        }

        // Sub-device 0 is expected to hand out the same top-of-region address
        // while buffer_3 is still allocated on sub-device 1.
        auto buffer_4 = CreateBuffer(shard_config_1, SubDeviceId{0});
        EXPECT_EQ(buffer_4->address(), max_addr - page_size_1);
        // An interleaved config combined with a sub-device id is expected to throw.
        EXPECT_THROW(CreateBuffer(interleaved_config, SubDeviceId{0}), std::exception);
    }
}

TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceSynchronization) {
uint32_t local_l1_size = 3200;
SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))});
Expand Down

0 comments on commit b81a355

Please sign in to comment.