Skip to content

Commit

Permalink
#2934: Replace GLOBAL_CQ with one CommandQueue per device and move sy…
Browse files Browse the repository at this point in the history
…smem writer (HW CQ) to be owned by Device

assign each device to specific host mem channel
  • Loading branch information
abhullar-tt committed Dec 5, 2023
1 parent c872e15 commit 8914d1c
Show file tree
Hide file tree
Showing 39 changed files with 294 additions and 444 deletions.
2 changes: 1 addition & 1 deletion docs/source/tt_metal/examples/dram_loopback.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ Program pre-compilation setup

.. code-block:: cpp
CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ;
CommandQueue& cq = detail::GetCommandQueue(device);
Program program = CreateProgram();
We first obtain the global ``CommandQueue`` in order to use the fast dispatch
Expand Down
2 changes: 1 addition & 1 deletion docs/source/tt_metal/examples/matmul_single_core.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ core at (0, 0).

.. code-block:: cpp
CommandQueue& cq = *detail::GLOBAL_CQ;
CommandQueue& cq = detail::GetCommandQueue(device);
Program program{};
CoreRange core = {.start={0, 0}, .end={0, 0}};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1197,8 +1197,8 @@ int main(int argc, char** argv) {

// took from run_operation.cpp
auto start = std::chrono::high_resolution_clock::now();
EnqueueProgram(*::detail::GLOBAL_CQ, program, false);
Finish(*::detail::GLOBAL_CQ);
EnqueueProgram(::detail::GetCommandQueue(device), program, false);
Finish(::detail::GetCommandQueue(device));
auto end = std::chrono::high_resolution_clock::now();
duration = end - start;
tt_metal::DumpDeviceProfileResults(device, program);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -381,8 +381,8 @@ int main(int argc, char **argv) {
std::chrono::duration<double, std::nano> duration;
// took from run_operation.cpp
auto start = std::chrono::high_resolution_clock::now();
EnqueueProgram(*::detail::GLOBAL_CQ, program, false);
Finish(*::detail::GLOBAL_CQ);
EnqueueProgram(::detail::GetCommandQueue(device), program, false);
Finish(::detail::GetCommandQueue(device));
auto end = std::chrono::high_resolution_clock::now();
duration = end - start;
tt_metal::DumpDeviceProfileResults(device, program);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -272,8 +272,8 @@ int main(int argc, char **argv) {

log_info(LogTest, "Running {} core test", num_cores_r * num_cores_c);
auto begin = std::chrono::steady_clock::now();
EnqueueProgram(*::detail::GLOBAL_CQ, program, false);
Finish(*::detail::GLOBAL_CQ);
EnqueueProgram(::detail::GetCommandQueue(device), program, false);
Finish(::detail::GetCommandQueue(device));
auto end = std::chrono::steady_clock::now();
auto elapsed_us = duration_cast<microseconds>(end - begin).count();
auto bw = (total_tiles_size_bytes / 1024.0 / 1024.0 / 1024.0) /
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -225,8 +225,8 @@ int main(int argc, char **argv) {

log_info(LogTest, "Running {} core test", num_cores_r * num_cores_c);
auto begin = std::chrono::steady_clock::now();
EnqueueProgram(*::detail::GLOBAL_CQ, program, false);
Finish(*::detail::GLOBAL_CQ);
EnqueueProgram(::detail::GetCommandQueue(device), program, false);
Finish(::detail::GetCommandQueue(device));
auto end = std::chrono::steady_clock::now();
auto elapsed_us = duration_cast<microseconds>(end - begin).count();
auto bw = (total_tiles_size_bytes / 1024.0 / 1024.0 / 1024.0) /
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ int main(int argc, char** argv) {
// Device Setup
int device_id = 0;
tt_metal::Device* device = tt_metal::CreateDevice(device_id);
CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ;
CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device);

// Application Setup
uint32_t single_tile_size = 2 * 1024;
Expand Down
2 changes: 1 addition & 1 deletion tests/tt_metal/tt_metal/test_eltwise_binary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ int main(int argc, char** argv) {
tt_metal::Device* device = tt_metal::CreateDevice(device_id);


CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ;
CommandQueue& cq = detail::GetCommandQueue(device);

Program programs[] = {tt_metal::CreateProgram(), tt_metal::CreateProgram(), tt_metal::CreateProgram()};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ int main(int argc, char **argv) {
per_core_N);


CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ;
CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device);

////////////////////////////////////////////////////////////////////////////
// Execute Application
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ void test_enqueue_program(std::function<tt_metal::Program(tt_metal::Device *devi

vector<uint32_t> out_vec;
{
CommandQueue& cq = *tt::tt_metal::detail::GLOBAL_CQ;
CommandQueue& cq = tt::tt_metal::detail::GetCommandQueue(device);

// Enqueue program inputs
Buffer buf(device, NUM_TILES * 2048, 2048, BufferType::DRAM);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,246 +135,7 @@ void create_CBs_for_fused_matmul(
auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config);
}
}
struct SingleCoreMatmulConfig {
bool activations_rm = false;
bool outputs_rm = false;
size_t M = 0;
size_t K = 0;
size_t N = 0;
size_t out_subblock_h = 0;
size_t out_subblock_w = 0;
size_t in0_block_w = 0;
size_t input0_dram_channel = 0;
size_t input1_dram_channel = 0;
size_t output_dram_channel = 0;
CoreCoord core = {};
};

bool single_core_matmul(tt_metal::Device* device, const SingleCoreMatmulConfig& cfg) {
// Since running in slow dispatch mode
tt::tt_metal::detail::GLOBAL_CQ.reset();
bool pass = true;
////////////////////////////////////////////////////////////////////////////
// Application Setup
////////////////////////////////////////////////////////////////////////////
tt_metal::Program program = tt_metal::CreateProgram();

uint32_t single_tile_size = 2 * 1024;
TT_FATAL(
cfg.M * cfg.in0_block_w * single_tile_size * 2 <= 150 * 1024, "input0 block must fit within 150kB of L1");
TT_FATAL(
cfg.N * cfg.in0_block_w * single_tile_size * 2 <= 100 * 1024, "input1 block must fit within 100kB of L1");
TT_FATAL(cfg.M * cfg.N * single_tile_size <= 600 * 1024, "output block must fit within 600kB of L1");
uint32_t dram_buffer_size_input0 =
single_tile_size * cfg.M * cfg.K; // num_tiles of FP16_B, hard-coded in the reader/writer kernels
uint32_t dram_buffer_size_input1 =
single_tile_size * cfg.K * cfg.N; // num_tiles of FP16_B, hard-coded in the reader/writer kernels
uint32_t dram_buffer_size_output =
single_tile_size * cfg.M * cfg.N; // num_tiles of FP16_B, hard-coded in the reader/writer kernels

auto input0_dram_buffer = CreateBuffer(
device,
dram_buffer_size_input0,
dram_buffer_size_input0,
tt_metal::BufferType::DRAM);
uint32_t input0_dram_byte_address = input0_dram_buffer.address();
auto input1_dram_buffer = CreateBuffer(
device,
dram_buffer_size_input1,
dram_buffer_size_input1,
tt_metal::BufferType::DRAM);
uint32_t input1_dram_byte_address = input1_dram_buffer.address();
auto output_dram_buffer = CreateBuffer(
device,
dram_buffer_size_output,
dram_buffer_size_output,
tt_metal::BufferType::DRAM);
uint32_t output_dram_byte_address = output_dram_buffer.address();

auto input0_dram_noc_xy = input0_dram_buffer.noc_coordinates();
auto input1_dram_noc_xy = input1_dram_buffer.noc_coordinates();
auto output_dram_noc_xy = output_dram_buffer.noc_coordinates();

std::vector<uint32_t> reader_rt_args{
(std::uint32_t)input0_dram_byte_address,
(std::uint32_t)input0_dram_noc_xy.x,
(std::uint32_t)input0_dram_noc_xy.y,
(std::uint32_t)input1_dram_byte_address,
(std::uint32_t)input1_dram_noc_xy.x,
(std::uint32_t)input1_dram_noc_xy.y,
(std::uint32_t)(cfg.K / cfg.in0_block_w), // num_blocks
(std::uint32_t)(cfg.M * cfg.in0_block_w), // input 0 block num tiles
(std::uint32_t)(cfg.N * cfg.in0_block_w), // input 1 block num tiles
(std::uint32_t)(cfg.M * cfg.in0_block_w * single_tile_size), // input 0 block bytes
(std::uint32_t)(cfg.N * cfg.in0_block_w * single_tile_size)}; // input 1 block bytes
std::vector<uint32_t> writer_rt_args;
string writer_kernel_name;
if (cfg.outputs_rm) {
writer_kernel_name = "tt_metal/kernels/dataflow/writer_unary.cpp";
writer_rt_args = {
(std::uint32_t)output_dram_byte_address,
(std::uint32_t)output_dram_noc_xy.x,
(std::uint32_t)output_dram_noc_xy.y,
uint(cfg.M * cfg.N)};
} else {
writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp";
writer_rt_args = {
(std::uint32_t)output_dram_byte_address,
(std::uint32_t)output_dram_noc_xy.x,
(std::uint32_t)output_dram_noc_xy.y,
(std::uint32_t)cfg.out_subblock_h, // num tiles per sub block m
(std::uint32_t)cfg.out_subblock_w, // num tiles per sub block n
(std::uint32_t)(cfg.M / cfg.out_subblock_h), // num sub blocks m
(std::uint32_t)(cfg.N / cfg.out_subblock_w), // num sub blocks n
(std::uint32_t)(
cfg.out_subblock_w * single_tile_size *
(cfg.N / cfg.out_subblock_w)), // bytes offset to next row within sub-block
(std::uint32_t)(
cfg.out_subblock_h * cfg.out_subblock_w * single_tile_size *
(cfg.N / cfg.out_subblock_w)), // bytes offset to next row of sub-blocks
(std::uint32_t)(cfg.out_subblock_w * single_tile_size)}; // bytes offset to next sub-block
}
auto writer_kernel = tt_metal::CreateKernel(
program,
writer_kernel_name,
cfg.core,
tt_metal::DataMovementConfig{
.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});

auto reader_kernel = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp",
cfg.core,
tt_metal::DataMovementConfig{
.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});

int num_blocks = (cfg.K / cfg.in0_block_w);
int in0_num_subblocks = (cfg.M / cfg.out_subblock_h);
int in0_block_num_tiles = cfg.out_subblock_h * cfg.in0_block_w * in0_num_subblocks;
int in0_subblock_num_tiles = cfg.out_subblock_h * cfg.in0_block_w;
int in1_num_subblocks = (cfg.N / cfg.out_subblock_w);
int in1_block_num_tiles = cfg.out_subblock_w * cfg.in0_block_w * in1_num_subblocks;
int in1_per_core_w = cfg.out_subblock_w * in1_num_subblocks;
int out_subblock_num_tiles = cfg.out_subblock_h * cfg.out_subblock_w;
int in0_subblock_h = (in0_block_num_tiles / in0_num_subblocks) / cfg.in0_block_w;

create_CBs_for_fused_matmul(
program,
device,
cfg.core,
cfg.activations_rm,
cfg.outputs_rm,
cfg.M,
cfg.N,
cfg.in0_block_w,
cfg.out_subblock_h);

TT_FATAL(
in0_subblock_h * cfg.in0_block_w * in0_num_subblocks == in0_block_num_tiles,
"in0_subblock_h * cfg.in0_block_w * in0_num_subblocks == in0_block_num_tiles");
TT_FATAL(cfg.in0_block_w == cfg.K, "Must match k tiles");

vector<uint32_t> compute_kernel_args = {
uint(cfg.in0_block_w),
uint(in0_num_subblocks),
uint(in0_block_num_tiles),
uint(in0_subblock_num_tiles),
uint(in0_subblock_h),

uint(in1_num_subblocks),
uint(in1_block_num_tiles),
uint(in1_per_core_w),

uint(num_blocks),

uint(cfg.out_subblock_h),
uint(cfg.out_subblock_w),
uint(out_subblock_num_tiles),

uint(cfg.activations_rm),
uint(cfg.outputs_rm)};

auto matmul_kernel = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp",
cfg.core,
tt_metal::ComputeConfig{.compile_args = compute_kernel_args});

////////////////////////////////////////////////////////////////////////////
// Stimulus Generation
////////////////////////////////////////////////////////////////////////////
std::vector<uint32_t> packed_identity = {};
std::vector<uint32_t> packed_activation = {};
auto activation = generate_uniform_random_vector<bfloat16>(
1.0f,
1.0f,
dram_buffer_size_input0 / bfloat16::SIZEOF,
std::chrono::system_clock::now().time_since_epoch().count());
if (cfg.activations_rm) {
packed_activation = pack_vector<uint32_t, bfloat16>(activation);
} else {
auto activations_tilized = tilize<bfloat16, 32, 32>(activation, cfg.M * 32, cfg.K * 32);
auto activations_tile_layout = convert_to_tile_layout(activations_tilized);
packed_activation = pack_vector<uint32_t, bfloat16>(activations_tile_layout);
}
auto identity =
generate_strided_vector<bfloat16>(0.0f, 1.0f, cfg.N * 32 + 1, 0, dram_buffer_size_input1 / bfloat16::SIZEOF);
auto identity_tilized = tilize<bfloat16, 32, 32>(identity, cfg.K * 32, cfg.N * 32);
auto identity_tile_layout = convert_to_tile_layout(identity_tilized);
packed_identity = pack_vector<uint32_t, bfloat16>(identity_tile_layout);
////////////////////////////////////////////////////////////////////////////
// Golden Generation
////////////////////////////////////////////////////////////////////////////
auto packed_golden = packed_activation; //

////////////////////////////////////////////////////////////////////////////
// Compile and Execute Application
////////////////////////////////////////////////////////////////////////////

tt_metal::detail::WriteToBuffer(input0_dram_buffer, packed_activation);
tt_metal::detail::WriteToBuffer(input1_dram_buffer, packed_identity);
std::vector<uint32_t> input0_dram_readback_packed;
tt_metal::detail::ReadFromBuffer(input0_dram_buffer, input0_dram_readback_packed);
EXPECT_TRUE(input0_dram_readback_packed == packed_activation);
print_vector_fixed_numel_per_row(unpack_vector<bfloat16, uint32_t>(input0_dram_readback_packed), 32);
std::vector<uint32_t> input1_dram_readback_packed;
tt_metal::detail::ReadFromBuffer(input1_dram_buffer, input1_dram_readback_packed);
EXPECT_TRUE(input1_dram_readback_packed == packed_identity);
print_vector_fixed_numel_per_row(unpack_vector<bfloat16, uint32_t>(input1_dram_readback_packed), 32);


tt_metal::SetRuntimeArgs(program, reader_kernel, cfg.core, reader_rt_args);
tt_metal::SetRuntimeArgs(program, writer_kernel, cfg.core, writer_rt_args);


tt_metal::detail::LaunchProgram(device, program);

////////////////////////////////////////////////////////////////////////////
// Comparison Checking
////////////////////////////////////////////////////////////////////////////
std::vector<uint32_t> input0_l1_readback_packed;
tt_metal::detail::ReadFromDeviceL1(device, cfg.core, 120 * 1024, 2 * single_tile_size, input0_l1_readback_packed);
EXPECT_TRUE(input0_l1_readback_packed == packed_activation);
std::vector<uint32_t> input1_l1_readback_packed;
tt_metal::detail::ReadFromDeviceL1(device, cfg.core, 250 * 1024, 2 * single_tile_size, input1_l1_readback_packed);
EXPECT_TRUE(input1_l1_readback_packed == packed_identity);

// std::vector<uint32_t> input1_l1_readback_packed;
// std::vector<uint32_t> output_l1_readback_packed;
std::vector<uint32_t> dest_buffer_data;
tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_buffer_data);
auto dest_buffer_data_unpacked = unpack_vector<bfloat16, uint32_t>(dest_buffer_data);
if (not cfg.outputs_rm) {
dest_buffer_data_unpacked = convert_to_flat_layout(dest_buffer_data_unpacked);
dest_buffer_data_unpacked = untilize<bfloat16, 32, 32>(dest_buffer_data_unpacked, cfg.M * 32, cfg.N * 32);
}
pass &=
is_close_vectors<bfloat16>(activation, dest_buffer_data_unpacked, [&](const bfloat16& a, const bfloat16& b) {
return is_close(a, b, 0.15f);
});

return pass;
}
bool single_tile_matmul(tt_metal::Device* device) {

bool pass = true;
Expand Down
Loading

0 comments on commit 8914d1c

Please sign in to comment.