Skip to content

Commit

Permalink
#4141: Add GetPreferredNOCForDRAMRead, GetPreferredNOCForDRAMWrite an…
Browse files Browse the repository at this point in the history
…d update all ops to use these apis
  • Loading branch information
tt-aho committed Dec 6, 2023
1 parent f310f9f commit 147f5ac
Show file tree
Hide file tree
Showing 69 changed files with 327 additions and 358 deletions.
16 changes: 8 additions & 8 deletions tests/tt_eager/python_api_testing/unit_testing/test_bert_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,10 @@ def test_bert_linear(
# out_subblock_w = 4
# out_subblock_h = 4

print("in0 block w h " + str(in0_block_w * 32) + " " + str(in0_block_h * 32))
print("in1 block w h " + str(out_block_w * 32) + " " + str(in0_block_w * 32))
print("out block w h " + str(out_block_w * 32) + " " + str(out_block_h * 32))
print("out subblock w h " + str(out_subblock_w * 32) + " " + str(out_subblock_h * 32))
logger.debug("in0 block w h " + str(in0_block_w * 32) + " " + str(in0_block_h * 32))
logger.debug("in1 block w h " + str(out_block_w * 32) + " " + str(in0_block_w * 32))
logger.debug("out block w h " + str(out_block_w * 32) + " " + str(out_block_h * 32))
logger.debug("out subblock w h " + str(out_subblock_w * 32) + " " + str(out_subblock_h * 32))

interleaved_mem_config_L1 = ttl.tensor.MemoryConfig(
memory_layout=ttl.tensor.TensorMemoryLayout.INTERLEAVED,
Expand Down Expand Up @@ -297,10 +297,10 @@ def test_bert_linear_batch7(
while out_block_w % out_subblock_w != 0:
out_subblock_w = out_block_w // 2

print("in0 block w h " + str(in0_block_w * 32) + " " + str(in0_block_h * 32))
print("in1 block w h " + str(out_block_w * 32) + " " + str(in0_block_w * 32))
print("out block w h " + str(out_block_w * 32) + " " + str(out_block_h * 32))
print("out subblock w h " + str(out_subblock_w * 32) + " " + str(out_subblock_h * 32))
logger.debug("in0 block w h " + str(in0_block_w * 32) + " " + str(in0_block_h * 32))
logger.debug("in1 block w h " + str(out_block_w * 32) + " " + str(in0_block_w * 32))
logger.debug("out block w h " + str(out_block_w * 32) + " " + str(out_block_h * 32))
logger.debug("out subblock w h " + str(out_subblock_w * 32) + " " + str(out_subblock_h * 32))

interleaved_mem_config_L1 = ttl.tensor.MemoryConfig(
memory_layout=ttl.tensor.TensorMemoryLayout.INTERLEAVED,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,13 @@ operation::ProgramWithCallbacks bcast_multi_core_h(const Tensor &a, const Tensor
program,
reader_name,
all_device_cores,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = reader_compile_time_args});
tt_metal::ReaderDataMovementConfig{.compile_args = reader_compile_time_args});

KernelHandle unary_writer_kernel_id = tt_metal::CreateKernel(
program,
"tt_eager/tt_dnn/kernels/dataflow/writer_unary_8bank_input_cols_batched.cpp",
all_device_cores,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = writer_compile_time_args});
tt_metal::WriterDataMovementConfig{.compile_args = writer_compile_time_args});

std::map<std::string, std::string> bcast_defines = bcast_op_utils::get_defines(bcast_dim, bcast_math);
auto bcast_kernel_id = tt_metal::CreateKernel(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,13 @@ operation::ProgramWithCallbacks bcast_multi_core_hw(const Tensor &a, const Tenso
program,
reader_name,
all_device_cores,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = reader_compile_time_args, .defines = reader_defines});
tt_metal::ReaderDataMovementConfig{.compile_args = reader_compile_time_args, .defines = reader_defines});

KernelHandle unary_writer_kernel_id = tt_metal::CreateKernel(
program,
"tt_eager/tt_dnn/kernels/dataflow/writer_unary_interleaved_start_id.cpp",
all_device_cores,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = writer_compile_time_args});
tt_metal::WriterDataMovementConfig{.compile_args = writer_compile_time_args});

auto bcast_kernel_id = tt_metal::CreateKernel(
program,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,13 @@ operation::ProgramWithCallbacks bcast_multi_core_w(const Tensor &a, const Tensor
program,
reader_name,
all_device_cores,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = reader_compile_time_args});
tt_metal::ReaderDataMovementConfig{.compile_args = reader_compile_time_args});

KernelHandle unary_writer_kernel_id = tt_metal::CreateKernel(
program,
"tt_eager/tt_dnn/kernels/dataflow/writer_unary_8bank_input_cols_batched.cpp",
all_device_cores,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = writer_compile_time_args});
tt_metal::WriterDataMovementConfig{.compile_args = writer_compile_time_args});

std::map<std::string, std::string> bcast_defines = bcast_op_utils::get_defines(bcast_dim, bcast_math);
auto bcast_kernel_id = tt_metal::CreateKernel(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,13 @@ operation::ProgramWithCallbacks bcast_single_core(const Tensor &a, const Tensor
program,
reader_name,
core,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = reader_compile_time_args});
tt_metal::ReaderDataMovementConfig{.compile_args = reader_compile_time_args});

KernelHandle unary_writer_kernel_id = tt_metal::CreateKernel(
program,
"tt_eager/tt_dnn/kernels/dataflow/writer_unary_interleaved_start_id.cpp",
core,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = writer_compile_time_args});
tt_metal::WriterDataMovementConfig{.compile_args = writer_compile_time_args});

const char* compute_name = bcast_op_utils::get_compute_name(bcast_dim);
std::map<std::string, std::string> bcast_defines = bcast_op_utils::get_defines(bcast_dim, bcast_math);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,13 @@ operation::ProgramWithCallbacks matmul_multi_core(const Tensor &a, const Tensor
program,
"tt_eager/tt_dnn/kernels/dataflow/reader_bmm_8bank_output_tiles_partitioned.cpp",
all_cores,
tt_metal::DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = reader_compile_time_args});
tt_metal::ReaderDataMovementConfig{.compile_args = reader_compile_time_args});

auto writer_id = tt_metal::CreateKernel(
program,
"tt_eager/tt_dnn/kernels/dataflow/writer_unary_interleaved_start_id.cpp",
all_cores,
tt_metal::DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = writer_compile_time_args});
tt_metal::WriterDataMovementConfig{.compile_args = writer_compile_time_args});

vector<uint32_t> compute_args_group_1 = {
1, // B
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,13 +110,13 @@ tt_metal::operation::ProgramWithCallbacks create_program(
program,
"tt_eager/tt_dnn/kernels/dataflow/reader_bmm_tile_layout.cpp",
all_cores,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = reader_compile_time_args});
tt_metal::ReaderDataMovementConfig{.compile_args = reader_compile_time_args});

auto unary_writer_kernel_id = tt_metal::CreateKernel(
program,
"tt_eager/tt_dnn/kernels/dataflow/writer_bmm_tile_layout.cpp",
all_cores,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = writer_compile_time_args});
tt_metal::WriterDataMovementConfig{.compile_args = writer_compile_time_args});

// Create compute kernel
auto mm_kernel_id = tt_metal::CreateKernel(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,13 +115,13 @@ tt_metal::operation::ProgramWithCallbacks create_program(
program,
"tt_eager/tt_dnn/kernels/dataflow/reader_bmm_tile_layout.cpp",
all_cores,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = reader_compile_time_args});
tt_metal::ReaderDataMovementConfig{.compile_args = reader_compile_time_args});

auto unary_writer_kernel_id = tt_metal::CreateKernel(
program,
"tt_eager/tt_dnn/kernels/dataflow/writer_bmm_tile_layout.cpp",
all_cores,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = writer_compile_time_args});
tt_metal::WriterDataMovementConfig{.compile_args = writer_compile_time_args});

// Create compute kernel
auto mm_kernel_id = tt_metal::CreateKernel(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "tt_metal/common/constants.hpp"
#include "hostdevcommon/common_values.hpp"
#include "tt_metal/detail/util.hpp"
#include "tt_metal/detail/tt_metal.hpp"

using namespace tt::constants;
using namespace tt;
Expand Down Expand Up @@ -247,27 +248,31 @@ operation::ProgramWithCallbacks create_program_mcast_in0(
}

mm_kernel_in1_sender_writer_defines["SKIP_MCAST"] = "1";

// in1 is the reader of weights/output writer, and we choose to make it use the optimized reader noc
tt_metal::NOC in0_noc = detail::GetPreferredNOCForDRAMWrite(device->arch());
tt_metal::NOC in1_noc = detail::GetPreferredNOCForDRAMRead(device->arch());

auto mm_kernel_in0_sender_id = tt_metal::CreateKernel(
program,
in0_is_sharded ? "tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_block_sharded.cpp" : "tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp",
mcast_sender,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = in0_sender_compile_time_args});
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = in0_noc, .compile_args = in0_sender_compile_time_args});

auto mm_kernel_in1_sender_writer_id = tt_metal::CreateKernel(
program,
"tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp",
all_cores,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = in1_sender_writer_compile_time_args, .defines = mm_kernel_in1_sender_writer_defines});
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = in1_noc, .compile_args = in1_sender_writer_compile_time_args, .defines = mm_kernel_in1_sender_writer_defines});


KernelHandle mm_kernel_in0_receiver_id = 0;
if (!in0_is_sharded) {
mm_kernel_in0_receiver_id = tt_metal::CreateKernel(
program,
"tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp",
/* in0_receiver_in1_sender, // If not using half-half noc setup */
mcast_receivers,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = in0_receiver_compile_time_args});
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = in0_noc, .compile_args = in0_receiver_compile_time_args});
}
// Compute kernel compile time args

Expand Down Expand Up @@ -374,8 +379,11 @@ operation::ProgramWithCallbacks create_program_mcast_in0(
in0_mcast_noc_y.push_back(device->worker_core_from_logical_core({0, core_idx_y}).y);
}
}
CoreCoord start_core_noc = device->worker_core_from_logical_core({start_core_x, start_core_y});
CoreCoord end_core_noc = device->worker_core_from_logical_core({start_core_x + num_cores_c - 1, start_core_y + num_cores_r - 1});
CoreCoord start_core_noc = top_left_core_physical;
CoreCoord end_core_noc = bottom_right_core_physical;
if (in0_noc == NOC::NOC_1) {
std::swap(start_core_noc, end_core_noc);
}

for(uint32_t i = 0; i < num_cores; i++) {
uint32_t core_idx_x = i % num_cores_c;
Expand Down Expand Up @@ -404,10 +412,10 @@ operation::ProgramWithCallbacks create_program_mcast_in0(
(std::uint32_t) in0_buffer->address(),
(std::uint32_t) K * per_core_M * output_idx_y, // in0_tensor_start_tile_id
// in0 mcast args
(std::uint32_t) top_left_core_physical.x, // in0_mcast_dest_noc_start_x
(std::uint32_t) top_left_core_physical.y, // in0_mcast_dest_noc_start_y
(std::uint32_t) bottom_right_core_physical.x, // in0_mcast_dest_noc_end_x
(std::uint32_t) bottom_right_core_physical.y, // in0_mcast_dest_noc_end_y
(std::uint32_t) start_core_noc.x, // in0_mcast_dest_noc_start_x
(std::uint32_t) start_core_noc.y, // in0_mcast_dest_noc_start_y
(std::uint32_t) end_core_noc.x, // in0_mcast_dest_noc_end_x
(std::uint32_t) end_core_noc.y, // in0_mcast_dest_noc_end_y

// padding args
(std::uint32_t) per_core_M // last_block_h
Expand Down Expand Up @@ -767,24 +775,29 @@ operation::ProgramWithCallbacks create_program_mcast_in1(
}

mm_kernel_in0_sender_defines["SKIP_MCAST"] = "1";

// in1 is the reader of weights/output writer, and we choose to make it use the optimized reader noc
tt_metal::NOC in0_noc = detail::GetPreferredNOCForDRAMWrite(device->arch());
tt_metal::NOC in1_noc = detail::GetPreferredNOCForDRAMRead(device->arch());

auto mm_kernel_in0_sender_id = tt_metal::CreateKernel(
program,
"tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp",
all_cores,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = in0_sender_compile_time_args, .defines = mm_kernel_in0_sender_defines});
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = in0_noc, .compile_args = in0_sender_compile_time_args, .defines = mm_kernel_in0_sender_defines});

auto mm_kernel_in1_sender_writer_id = tt_metal::CreateKernel(
program,
"tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp",
mcast_sender,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = in1_sender_writer_compile_time_args, .defines = mm_kernel_in1_sender_writer_defines});
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = in1_noc, .compile_args = in1_sender_writer_compile_time_args, .defines = mm_kernel_in1_sender_writer_defines});


auto mm_kernel_in1_receiver_writer_id = tt_metal::CreateKernel(
program,
"tt_eager/tt_dnn/op_library/bmm/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp",
mcast_receivers,
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = in1_receiver_writer_compile_time_args, .defines = mm_kernel_in1_receiver_writer_defines});
tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = in1_noc, .compile_args = in1_receiver_writer_compile_time_args, .defines = mm_kernel_in1_receiver_writer_defines});

// Compute kernel compile time args

Expand Down Expand Up @@ -875,6 +888,13 @@ operation::ProgramWithCallbacks create_program_mcast_in1(

std::vector<KernelHandle> reader_kernel_ids;
std::vector<KernelHandle> writer_kernel_ids;

CoreCoord start_core_noc = bottom_right_core_physical;
CoreCoord end_core_noc = top_left_core_physical;
if (in1_noc == NOC::NOC_0) {
std::swap(start_core_noc, end_core_noc);
}

for(uint32_t i = 0; i < num_cores; i++) {
uint32_t core_idx_x = i % num_cores_c;
uint32_t core_idx_y = i / num_cores_c;
Expand All @@ -890,10 +910,10 @@ operation::ProgramWithCallbacks create_program_mcast_in1(
(std::uint32_t) in1_buffer->address(),
(std::uint32_t) per_core_N * output_idx_x, //in1_tensor_start_tile_id
// in1 mcast args
(std::uint32_t) bottom_right_core_physical.x, // in1_mcast_dest_noc_start_x
(std::uint32_t) bottom_right_core_physical.y, // in1_mcast_dest_noc_start_y
(std::uint32_t) top_left_core_physical.x, // in1_mcast_dest_noc_end_x
(std::uint32_t) top_left_core_physical.y, // in1_mcast_dest_noc_end_y
(std::uint32_t) start_core_noc.x, // in1_mcast_dest_noc_start_x
(std::uint32_t) start_core_noc.y, // in1_mcast_dest_noc_start_y
(std::uint32_t) end_core_noc.x, // in1_mcast_dest_noc_end_x
(std::uint32_t) end_core_noc.y, // in1_mcast_dest_noc_end_y

// WRITER
// out tensor args
Expand Down
Loading

0 comments on commit 147f5ac

Please sign in to comment.