Skip to content

Commit

Permalink
#4269: Supports specifying up to 2 hardware command queues on a singl…
Browse files Browse the repository at this point in the history
…e device
  • Loading branch information
DrJessop committed Jan 3, 2024
1 parent baa6280 commit 5b90c1c
Show file tree
Hide file tree
Showing 34 changed files with 1,016 additions and 415 deletions.
7 changes: 4 additions & 3 deletions tests/scripts/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,10 @@ run_post_commit_pipeline_tests() {
# run_module_tests "$tt_arch" "llrt" "$pipeline_type"
if [[ $dispatch_mode == "slow" ]]; then
./tests/scripts/run_pre_post_commit_regressions_slow_dispatch.sh
else
elif [[ $dispatch_mode == "fast" ]]; then
./tests/scripts/run_pre_post_commit_regressions_fast_dispatch.sh
elif [[ $dispatch_mode == "fast-multi-queue-single-device" ]]; then
TT_METAL_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue --gtest_filter=MultiCommandQueueSingleDeviceFixture.*
fi
}

Expand Down Expand Up @@ -167,7 +169,6 @@ run_pipeline_tests() {

# Add your logic here for pipeline-specific tests
echo "Running tests for pipeline: $pipeline_type with tt-arch: $tt_arch"

# Call the appropriate module tests based on pipeline
if [[ $pipeline_type == "post_commit" ]]; then
run_post_commit_pipeline_tests "$tt_arch" "$pipeline_type" "$dispatch_mode"
Expand Down Expand Up @@ -261,7 +262,7 @@ main() {
dispatch_mode=${dispatch_mode:-$default_dispatch_mode}
pipeline_type=${pipeline_type:-$default_pipeline_type}

available_dispatch_modes=("fast" "slow")
available_dispatch_modes=("fast" "slow" "fast-multi-queue-single-device")
available_tt_archs=("grayskull" "wormhole_b0")

# Validate arguments
Expand Down
3 changes: 2 additions & 1 deletion tests/tt_metal/tt_metal/module.mk
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
include $(TT_METAL_HOME)/tests/tt_metal/tt_metal/unit_tests_common/module.mk
include $(TT_METAL_HOME)/tests/tt_metal/tt_metal/unit_tests/module.mk
include $(TT_METAL_HOME)/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/module.mk
include $(TT_METAL_HOME)/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/module.mk
# Programming examples for external users
include $(TT_METAL_HOME)/tt_metal/programming_examples/module.mk

Expand Down Expand Up @@ -77,7 +78,7 @@ TT_METAL_TESTS_DEPS = $(addprefix $(OBJDIR)/, $(TT_METAL_TESTS_SRCS:.cpp=.d))
-include $(TT_METAL_TESTS_DEPS)

# Each module has a top level target as the entrypoint which must match the subdir name
tests/tt_metal: $(TT_METAL_TESTS) programming_examples tests/tt_metal/unit_tests tests/tt_metal/unit_tests_fast_dispatch
tests/tt_metal: $(TT_METAL_TESTS) programming_examples tests/tt_metal/unit_tests tests/tt_metal/unit_tests_fast_dispatch tests/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue
tests/tt_metal/all: $(TT_METAL_TESTS)
tests/tt_metal/%: $(TESTDIR)/tt_metal/% ;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@
#include "tt_metal/tools/profiler/op_profiler.hpp"

using namespace tt;

//
void measure_latency(string kernel_name) {
const int device_id = 0;
tt_metal::Device *device = tt_metal::CreateDevice(device_id);

auto dispatch_cores = device->dispatch_cores().begin();
CoreCoord producer_logical_core = *dispatch_cores++;
CoreCoord consumer_logical_core = *dispatch_cores;
// auto dispatch_cores = device->dispatch_cores().begin();
CoreCoord producer_logical_core = {0, 0};//*dispatch_cores++;
CoreCoord consumer_logical_core = {0, 0}; //*dispatch_cores;

auto first_worker_physical_core = device->worker_core_from_logical_core({0, 0});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ using namespace tt::tt_metal;

namespace host_tests {

TEST_F(MultiCommandQueueFixture, TestAccessCommandQueue) {
namespace multi_device_tests {
TEST_F(CommandQueueMultiDeviceFixture, TestAccessCommandQueue) {
for (unsigned int device_id = 0; device_id < num_devices_; device_id++) {
EXPECT_NO_THROW(detail::GetCommandQueue(devices_[device_id]));
}
Expand All @@ -36,7 +37,7 @@ TEST(FastDispatchHostSuite, TestCannotAccessCommandQueueForClosedDevice) {
EXPECT_ANY_THROW(detail::GetCommandQueue(device));
}

TEST_F(MultiCommandQueueFixture, TestDirectedLoopbackToUniqueHugepage) {
TEST_F(CommandQueueMultiDeviceFixture, TestDirectedLoopbackToUniqueHugepage) {
std::unordered_map<chip_id_t, std::vector<uint32_t>> golden_data;

const uint32_t byte_size = 2048 * 16;
Expand All @@ -62,5 +63,9 @@ TEST_F(MultiCommandQueueFixture, TestDirectedLoopbackToUniqueHugepage) {
EXPECT_EQ(readback_data, golden_data.at(device_id));
}
}
}




} // namespace host_tests
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,6 @@ bool stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_sharded(

bool test_EnqueueWrap_on_EnqueueReadBuffer(Device* device, CommandQueue& cq, const TestBufferConfig& config) {
auto [buffer, src] = EnqueueWriteBuffer_prior_to_wrap(device, cq, config);

vector<uint32_t> dst;
EnqueueReadBuffer(cq, buffer, dst, true);

Expand Down Expand Up @@ -260,7 +259,6 @@ namespace dram_tests {

TEST_F(CommandQueueFixture, WriteOneTileToDramBank0) {
TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM};

EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config));
}

Expand Down Expand Up @@ -329,8 +327,8 @@ TEST_F(CommandQueueFixture, TestWrapHostHugepageOnEnqueueReadBuffer) {
uint32_t num_pages = buffer_size / page_size;

TestBufferConfig buf_config = {.num_pages = num_pages, .page_size = page_size, .buftype = BufferType::DRAM};

EXPECT_TRUE(local_test_functions::test_EnqueueWrap_on_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), buf_config));
CommandQueue a(this->device_, 0);
EXPECT_TRUE(local_test_functions::test_EnqueueWrap_on_EnqueueReadBuffer(this->device_, a, buf_config));
}

TEST_F(CommandQueueFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) {
Expand Down Expand Up @@ -420,7 +418,6 @@ namespace l1_tests {

TEST_F(CommandQueueFixture, WriteOneTileToL1Bank0) {
TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::L1};

EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(this->device_, tt::tt_metal::detail::GetCommandQueue(device_), config));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class MultiCommandQueueFixture : public ::testing::Test {
size_t num_devices_;
};

class MultiCommandQueueFixture : public ::testing::Test {
class CommandQueueMultiDeviceFixture : public ::testing::Test {
protected:
void SetUp() override {
auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE");
Expand All @@ -73,8 +73,6 @@ class MultiCommandQueueFixture : public ::testing::Test {
}
arch_ = tt::get_arch_from_string(tt::test_utils::get_env_arch_name());

num_devices_ = tt::tt_metal::GetNumAvailableDevices();

for (unsigned int id = 0; id < num_devices_; id++) {
auto* device = tt::tt_metal::CreateDevice(id);
devices_.push_back(device);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <memory>

#include "command_queue_fixture.hpp"
#include "gtest/gtest.h"
#include "tt_metal/host_api.hpp"
#include "tt_metal/test_utils/env_vars.hpp"
#include "tt_metal/detail/tt_metal.hpp"

using namespace tt::tt_metal;

struct TestBufferConfig {
uint32_t num_pages;
uint32_t page_size;
BufferType buftype;
};

struct BufferStressTestConfig {
// Used for normal write/read tests
uint32_t seed;
uint32_t num_pages_total;

uint32_t page_size;
uint32_t max_num_pages_per_buffer;

// Used for wrap test
uint32_t num_iterations;
uint32_t num_unique_vectors;
};

namespace local_test_functions {

vector<uint32_t> generate_arange_vector(uint32_t size_bytes) {
TT_FATAL(size_bytes % sizeof(uint32_t) == 0);
vector<uint32_t> src(size_bytes / sizeof(uint32_t), 0);

for (uint32_t i = 0; i < src.size(); i++) {
src.at(i) = i;
}
return src;
}

bool test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(Device* device, vector<std::reference_wrapper<CommandQueue>>& cqs, const TestBufferConfig& config) {
bool pass = true;
for (const bool use_void_star_api: {true, false}) {

size_t buf_size = config.num_pages * config.page_size;
vector<unique_ptr<Buffer>> buffers;
vector<vector<uint32_t>> srcs;
for (uint i = 0; i < cqs.size(); i++) {
buffers.push_back(std::make_unique<Buffer>(device, buf_size, config.page_size, config.buftype));
srcs.push_back(generate_arange_vector(buffers[i]->size()));
if (use_void_star_api) {
EnqueueWriteBuffer(cqs[i], *buffers[i], srcs[i].data(), false);
} else {
EnqueueWriteBuffer(cqs[i], *buffers[i], srcs[i], false);
}
}

for (uint i = 0; i < cqs.size(); i++) {
vector<uint32_t> result;
if (use_void_star_api) {
result.resize(buf_size / sizeof(uint32_t));
EnqueueReadBuffer(cqs[i], *buffers[i], result.data(), true);
} else {
EnqueueReadBuffer(cqs[i], *buffers[i], result, true);
}
bool local_pass = (srcs[i] == result);
pass &= local_pass;
}
}

return pass;
}
}


namespace basic_tests {
namespace dram_tests {

TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToDramBank0) {
TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM};
CommandQueue a(this->device_, 0);
CommandQueue b(this->device_, 1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
}

TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToAllDramBanks) {
TestBufferConfig config = {
.num_pages = uint32_t(this->device_->num_banks(BufferType::DRAM)),
.page_size = 2048,
.buftype = BufferType::DRAM};

CommandQueue a(this->device_, 0);
CommandQueue b(this->device_, 1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
}

TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) {
constexpr uint32_t num_round_robins = 2;
TestBufferConfig config = {
.num_pages = num_round_robins * (this->device_->num_banks(BufferType::DRAM)),
.page_size = 2048,
.buftype = BufferType::DRAM};

CommandQueue a(this->device_, 0);
CommandQueue b(this->device_, 1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
}

TEST_F(MultiCommandQueueSingleDeviceFixture, Sending131072Pages) {
// Was a failing case where we used to accidentally program cb num pages to be total
// pages instead of cb num pages.
TestBufferConfig config = {
.num_pages = 131072,
.page_size = 128,
.buftype = BufferType::DRAM};

CommandQueue a(this->device_, 0);
CommandQueue b(this->device_, 1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
}

TEST_F(MultiCommandQueueSingleDeviceFixture, TestNon32BAlignedPageSizeForDram) {
TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::DRAM};

CommandQueue a(this->device_, 0);
CommandQueue b(this->device_, 1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
}

TEST_F(MultiCommandQueueSingleDeviceFixture, TestNon32BAlignedPageSizeForDram2) {
// From stable diffusion read buffer
TestBufferConfig config = {.num_pages = 8 * 1024, .page_size = 80, .buftype = BufferType::DRAM};

CommandQueue a(this->device_, 0);
CommandQueue b(this->device_, 1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
}

TEST_F(MultiCommandQueueSingleDeviceFixture, TestPageSizeTooLarge) {
if (this->arch_ == tt::ARCH::WORMHOLE_B0) {
GTEST_SKIP(); // This test hanging on wormhole b0
}
// Should throw a host error due to the page size not fitting in the consumer CB
TestBufferConfig config = {.num_pages = 1024, .page_size = 250880 * 2, .buftype = BufferType::DRAM};

CommandQueue a(this->device_, 0);
CommandQueue b(this->device_, 1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_ANY_THROW(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
}

TEST_F(MultiCommandQueueSingleDeviceFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) {
uint32_t page_size = 2048;
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_->id());
uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(this->device_->id(), channel);
uint32_t num_pages = command_queue_size / page_size;

TestBufferConfig config = {.num_pages = num_pages, .page_size = page_size, .buftype = BufferType::DRAM};

CommandQueue a(this->device_, 0);
CommandQueue b(this->device_, 1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
}


} // end namespace dram_tests

namespace l1_tests {

TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToL1Bank0) {
TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::L1};
CommandQueue a(this->device_, 0);
CommandQueue b(this->device_, 1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
}

TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToAllL1Banks) {
auto compute_with_storage_grid = this->device_->compute_with_storage_grid_size();
TestBufferConfig config = {
.num_pages = uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y),
.page_size = 2048,
.buftype = BufferType::L1};

CommandQueue a(this->device_, 0);
CommandQueue b(this->device_, 1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
}

TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) {
auto compute_with_storage_grid = this->device_->compute_with_storage_grid_size();
TestBufferConfig config = {
.num_pages = 2 * uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y),
.page_size = 2048,
.buftype = BufferType::L1};

CommandQueue a(this->device_, 0);
CommandQueue b(this->device_, 1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
}

TEST_F(MultiCommandQueueSingleDeviceFixture, TestNon32BAlignedPageSizeForL1) {
TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::L1};

CommandQueue a(this->device_, 0);
CommandQueue b(this->device_, 1);
vector<std::reference_wrapper<CommandQueue>> cqs = {a, b};
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config));
}

} // end namespace l1_tests
} // end namespace basic_tests
Loading

0 comments on commit 5b90c1c

Please sign in to comment.