diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/common/work_split.hpp b/tests/tt_metal/tt_metal/perf_microbenchmark/common/work_split.hpp index 1a642387b00..8b4884c3e4b 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/common/work_split.hpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/common/work_split.hpp @@ -19,59 +19,6 @@ namespace tt { namespace tt_metal { -// splits the tiles evenly between num_cores, -// with option of padding where necessary -struct TilesSplit { - int num_cores_; - int total_tiles_; - int tpc_; // unclipped tiles per core - - inline TilesSplit(int num_cores, int total_tiles) : num_cores_(num_cores), total_tiles_(total_tiles) { - tpc_ = div_up(total_tiles_, num_cores_); - } - - // number of tiles per core for div_up split - inline uint32_t get_tpc() const { return tpc_; } - - // number of tiles per core for close to even split with multiples of 8 going - // to each core - inline uint32_t get_clipped_tpc(int icore) const { - auto result = (tpc_ * (icore + 1) > total_tiles_) ? (total_tiles_ - tpc_ * (icore + 1)) : tpc_; - return result; - } -}; - -struct CoreGridDesc { - uint32_t x_, y_; - CoreGridDesc(Device *dev) { - auto gs = dev->compute_with_storage_grid_size(); - x_ = gs.x; - y_ = gs.y; - TT_ASSERT(x_ > 0 && y_ > 0); - } - uint32_t total_cores() const { return x_ * y_; } - CoreCoord wrap_core(int icore) const { - TT_ASSERT(icore < total_cores()); - CoreCoord core = {(std::size_t)icore % x_, (std::size_t)icore / x_}; - return core; - } - - int numcores_dividing_numtiles(int num_tiles, int block_size = 1) { - // since we will be splitting num_tiles into num_cores we need to find - // num_cores such that num_tiles % num_cores = 0, so that it's evenly - // divided since we don't support leftovers at the moment - // TODO(AP): optimize if needed, O(max_cores) atm - uint32_t max_cores = total_cores(); - TT_ASSERT(max_cores % block_size == 0 || max_cores == 1); - if (max_cores > num_tiles) - max_cores = num_tiles; - for (int j = max_cores; j >= 1; j--) - if (num_tiles % j == 0) - return j; - return 1; - } -}; - // Given a number of tiles and number of cores available // Set the largest number of cores less than the number of tiles // Returns the number of cores as well as the number of tiles per core diff --git a/tt_eager/tt_dnn/op_library/moreh_adam/moreh_adam.cpp b/tt_eager/tt_dnn/op_library/moreh_adam/moreh_adam.cpp index 2feeb064f42..a0065d2437b 100644 --- a/tt_eager/tt_dnn/op_library/moreh_adam/moreh_adam.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_adam/moreh_adam.cpp @@ -37,15 +37,13 @@ operation::ProgramWithCallbacks moreh_adam_( // Device Setup //////////////////////////////////////////////////////////////////////////// tt_metal::Device *device = param.device(); - - tt_metal::CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord(core_grid.x_, num_cores_y); + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; // auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); // uint32_t num_cores_x = compute_with_storage_grid_size.x; // uint32_t num_cores_y = compute_with_storage_grid_size.y; - auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, num_tiles); + auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_tiles); //////////////////////////////////////////////////////////////////////////// // CircularBuffer Setup diff --git a/tt_eager/tt_dnn/op_library/moreh_adamw/moreh_adamw.cpp b/tt_eager/tt_dnn/op_library/moreh_adamw/moreh_adamw.cpp index c17cc662a59..3f0d618224c 100644 --- a/tt_eager/tt_dnn/op_library/moreh_adamw/moreh_adamw.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_adamw/moreh_adamw.cpp @@ -37,15 +37,13 @@ operation::ProgramWithCallbacks moreh_adamw_( // Device Setup //////////////////////////////////////////////////////////////////////////// tt_metal::Device *device = param.device(); - - tt_metal::CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord(core_grid.x_, num_cores_y); + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; // auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); // uint32_t num_cores_x = compute_with_storage_grid_size.x; // uint32_t num_cores_y = compute_with_storage_grid_size.y; - auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, num_tiles); + auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_tiles); //////////////////////////////////////////////////////////////////////////// // CircularBuffer Setup diff --git a/tt_eager/tt_dnn/op_library/moreh_clip_grad_norm/moreh_clip_grad_norm_step1/moreh_clip_grad_norm_step1.cpp b/tt_eager/tt_dnn/op_library/moreh_clip_grad_norm/moreh_clip_grad_norm_step1/moreh_clip_grad_norm_step1.cpp index 6b2a5608c7d..50c5c6f4182 100644 --- a/tt_eager/tt_dnn/op_library/moreh_clip_grad_norm/moreh_clip_grad_norm_step1/moreh_clip_grad_norm_step1.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_clip_grad_norm/moreh_clip_grad_norm_step1/moreh_clip_grad_norm_step1.cpp @@ -52,16 +52,15 @@ operation::ProgramWithCallbacks moreh_clip_grad_norm_step1_impl( //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// - tt_metal::CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord = {core_grid.x_, num_cores_y}; + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; const auto [num_cores_to_be_used, all_cores, core_group_1, core_group_2, num_inputs_per_core_group_1, - num_inputs_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, num_inputs); + num_inputs_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_inputs); TT_ASSERT(core_group_2.ranges().empty()); TT_ASSERT(num_inputs_per_core_group_1 == 1); TT_ASSERT(num_inputs_per_core_group_2 == 0); diff --git a/tt_eager/tt_dnn/op_library/moreh_clip_grad_norm/moreh_clip_grad_norm_step3/moreh_clip_grad_norm_step3.cpp b/tt_eager/tt_dnn/op_library/moreh_clip_grad_norm/moreh_clip_grad_norm_step3/moreh_clip_grad_norm_step3.cpp index e2298da64a5..1660991cb9e 100644 --- a/tt_eager/tt_dnn/op_library/moreh_clip_grad_norm/moreh_clip_grad_norm_step3/moreh_clip_grad_norm_step3.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_clip_grad_norm/moreh_clip_grad_norm_step3/moreh_clip_grad_norm_step3.cpp @@ -38,16 +38,16 @@ operation::ProgramWithCallbacks moreh_clip_grad_norm_step3_impl( //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// - tt_metal::CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord = {core_grid.x_, num_cores_y}; + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; + const auto [num_cores_to_be_used, all_cores, core_group_1, core_group_2, num_inputs_per_core_group_1, - num_inputs_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, num_inputs); + num_inputs_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_inputs); TT_ASSERT(core_group_2.ranges().empty()); TT_ASSERT(num_inputs_per_core_group_1 == 1); TT_ASSERT(num_inputs_per_core_group_2 == 0); diff --git a/tt_eager/tt_dnn/op_library/moreh_cumsum/moreh_cumsum_nc/moreh_cumsum_nc.cpp b/tt_eager/tt_dnn/op_library/moreh_cumsum/moreh_cumsum_nc/moreh_cumsum_nc.cpp index 111e53bd6dc..0b7f99c5958 100644 --- a/tt_eager/tt_dnn/op_library/moreh_cumsum/moreh_cumsum_nc/moreh_cumsum_nc.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_cumsum/moreh_cumsum_nc/moreh_cumsum_nc.cpp @@ -57,9 +57,8 @@ operation::ProgramWithCallbacks moreh_cumsum_nc( //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// - CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord = {core_grid.x_, num_cores_y}; + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; const uint32_t in0_t = 2; // input const uint32_t in1_t = 1; // zero @@ -71,7 +70,7 @@ operation::ProgramWithCallbacks moreh_cumsum_nc( core_group_1, core_group_2, num_cols_per_core_group_1, - num_cols_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, num_tiles_per_chip); + num_cols_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_tiles_per_chip); //////////////////////////////////////////////////////////////////////////// // CircularBuffer Setup diff --git a/tt_eager/tt_dnn/op_library/moreh_groupnorm/moreh_groupnorm.cpp b/tt_eager/tt_dnn/op_library/moreh_groupnorm/moreh_groupnorm.cpp index be86f98b6dc..1fd817e5e33 100644 --- a/tt_eager/tt_dnn/op_library/moreh_groupnorm/moreh_groupnorm.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_groupnorm/moreh_groupnorm.cpp @@ -101,9 +101,8 @@ operation::ProgramWithCallbacks moreh_groupnorm_impl( //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// - tt_metal::CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord(core_grid.x_, num_cores_y); + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; const auto [num_cores_to_be_used, @@ -111,7 +110,7 @@ operation::ProgramWithCallbacks moreh_groupnorm_impl( core_group_1, core_group_2, num_rows_per_core_group_1, - num_rows_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, num_rows); + num_rows_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_rows); log_debug(LogTest, fmt::format("num_cores_to_be_used: {}", num_cores_to_be_used).c_str()); log_debug(LogTest, fmt::format("num_rows_per_core_group_1: {}", num_rows_per_core_group_1).c_str()); diff --git a/tt_eager/tt_dnn/op_library/moreh_groupnorm_backward/gamma_beta_grad/moreh_groupnorm_backward_gamma_beta_grad.cpp b/tt_eager/tt_dnn/op_library/moreh_groupnorm_backward/gamma_beta_grad/moreh_groupnorm_backward_gamma_beta_grad.cpp index 44cf94e8d90..29b635e1e17 100644 --- a/tt_eager/tt_dnn/op_library/moreh_groupnorm_backward/gamma_beta_grad/moreh_groupnorm_backward_gamma_beta_grad.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_groupnorm_backward/gamma_beta_grad/moreh_groupnorm_backward_gamma_beta_grad.cpp @@ -75,9 +75,8 @@ operation::ProgramWithCallbacks moreh_groupnorm_backward_gamma_beta_grad_impl( //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// - tt_metal::CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord(core_grid.x_, num_cores_y); + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; const auto [num_cores_to_be_used, @@ -85,7 +84,7 @@ operation::ProgramWithCallbacks moreh_groupnorm_backward_gamma_beta_grad_impl( core_group_1, core_group_2, num_channels_per_core_group_1, - num_channels_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, num_channels); + num_channels_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_channels); log_debug(LogTest, fmt::format("num_cores_to_be_used: {}", num_cores_to_be_used).c_str()); log_debug(LogTest, fmt::format("num_channels_per_core_group_1: {}", num_channels_per_core_group_1).c_str()); diff --git a/tt_eager/tt_dnn/op_library/moreh_groupnorm_backward/input_grad/moreh_groupnorm_backward_input_grad.cpp b/tt_eager/tt_dnn/op_library/moreh_groupnorm_backward/input_grad/moreh_groupnorm_backward_input_grad.cpp index b990eedff0e..cb3bcb684b9 100644 --- a/tt_eager/tt_dnn/op_library/moreh_groupnorm_backward/input_grad/moreh_groupnorm_backward_input_grad.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_groupnorm_backward/input_grad/moreh_groupnorm_backward_input_grad.cpp @@ -71,9 +71,8 @@ operation::ProgramWithCallbacks moreh_groupnorm_backward_input_grad_impl( //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// - tt_metal::CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord(core_grid.x_, num_cores_y); + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; const auto [num_cores_to_be_used, @@ -81,7 +80,7 @@ operation::ProgramWithCallbacks moreh_groupnorm_backward_input_grad_impl( core_group_1, core_group_2, num_rows_per_core_group_1, - num_rows_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, num_rows); + num_rows_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_rows); log_debug(LogTest, fmt::format("num_cores_to_be_used: {}", num_cores_to_be_used).c_str()); log_debug(LogTest, fmt::format("num_rows_per_core_group_1: {}", num_rows_per_core_group_1).c_str()); diff --git a/tt_eager/tt_dnn/op_library/moreh_layernorm/moreh_layernorm_op.cpp b/tt_eager/tt_dnn/op_library/moreh_layernorm/moreh_layernorm_op.cpp index 879db0daac2..ee715c83d32 100644 --- a/tt_eager/tt_dnn/op_library/moreh_layernorm/moreh_layernorm_op.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_layernorm/moreh_layernorm_op.cpp @@ -127,9 +127,8 @@ operation::ProgramWithCallbacks moreh_layernorm_impl( // Core Setup //////////////////////////////////////////////////////////////////////////// const auto NCHt = N * C * Ht; - tt_metal::CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord = {core_grid.x_, num_cores_y}; + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; // core_group_2 works more. // If number of working cores is 108 and NCHt is 110, @@ -140,7 +139,7 @@ operation::ProgramWithCallbacks moreh_layernorm_impl( core_group_1, core_group_2, num_rows_per_core_group_1, - num_rows_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, NCHt); + num_rows_per_core_group_2] = tt_metal::split_work_to_cores(grid, NCHt); //////////////////////////////////////////////////////////////////////////// // CircularBuffer Setup diff --git a/tt_eager/tt_dnn/op_library/moreh_layernorm_backward/gamma_beta_grad/moreh_layernorm_backward_gamma_beta_grad.cpp b/tt_eager/tt_dnn/op_library/moreh_layernorm_backward/gamma_beta_grad/moreh_layernorm_backward_gamma_beta_grad.cpp index d25ec17a1ad..f0dd80b6cfc 100644 --- a/tt_eager/tt_dnn/op_library/moreh_layernorm_backward/gamma_beta_grad/moreh_layernorm_backward_gamma_beta_grad.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_layernorm_backward/gamma_beta_grad/moreh_layernorm_backward_gamma_beta_grad.cpp @@ -95,9 +95,8 @@ operation::ProgramWithCallbacks moreh_layernorm_backward_gamma_beta_grad_impl( //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// - tt_metal::CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord = {core_grid.x_, num_cores_y}; + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; const auto [num_cores_to_be_used, @@ -105,7 +104,7 @@ operation::ProgramWithCallbacks moreh_layernorm_backward_gamma_beta_grad_impl( core_group_1, core_group_2, num_cols_per_core_group_1, - num_cols_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, Wt); + num_cols_per_core_group_2] = tt_metal::split_work_to_cores(grid, Wt); //////////////////////////////////////////////////////////////////////////// // CircularBuffer Setup diff --git a/tt_eager/tt_dnn/op_library/moreh_layernorm_backward/input_grad/moreh_layernorm_backward_input_grad.cpp b/tt_eager/tt_dnn/op_library/moreh_layernorm_backward/input_grad/moreh_layernorm_backward_input_grad.cpp index ea191cb71b4..76431a3f537 100644 --- a/tt_eager/tt_dnn/op_library/moreh_layernorm_backward/input_grad/moreh_layernorm_backward_input_grad.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_layernorm_backward/input_grad/moreh_layernorm_backward_input_grad.cpp @@ -106,9 +106,8 @@ operation::ProgramWithCallbacks moreh_layernorm_backward_input_grad_impl( //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// - tt_metal::CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord = {core_grid.x_, num_cores_y}; + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; const auto [num_cores_to_be_used, @@ -116,7 +115,7 @@ operation::ProgramWithCallbacks moreh_layernorm_backward_input_grad_impl( core_group_1, core_group_2, num_rows_per_core_group_1, - num_rows_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, NCHt); + num_rows_per_core_group_2] = tt_metal::split_work_to_cores(grid, NCHt); //////////////////////////////////////////////////////////////////////////// // CircularBuffer Setup diff --git a/tt_eager/tt_dnn/op_library/moreh_linear_backward/bias_backward_h/moreh_bias_backward_multi_core_h.cpp b/tt_eager/tt_dnn/op_library/moreh_linear_backward/bias_backward_h/moreh_bias_backward_multi_core_h.cpp index f865b702d0f..99d29233657 100644 --- a/tt_eager/tt_dnn/op_library/moreh_linear_backward/bias_backward_h/moreh_bias_backward_multi_core_h.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_linear_backward/bias_backward_h/moreh_bias_backward_multi_core_h.cpp @@ -41,9 +41,8 @@ operation::ProgramWithCallbacks moreh_bias_backward_multi_core_h(const Tensor &o //////////////////////////////////////////////////////////////////////////// // This should allocate a DRAM buffer on the device Device *device = output_grad.device(); - CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord(core_grid.x_, num_cores_y); + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; const auto [num_cores_to_be_used, @@ -51,7 +50,7 @@ operation::ProgramWithCallbacks moreh_bias_backward_multi_core_h(const Tensor &o core_group_1, core_group_2, num_cols_per_core_group_1, - num_cols_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, Wt); + num_cols_per_core_group_2] = tt_metal::split_work_to_cores(grid, Wt); //////////////////////////////////////////////////////////////////////////// // CircularBuffer Setup diff --git a/tt_eager/tt_dnn/op_library/moreh_matmul/multi_core/moreh_matmul_op_multi_core.cpp b/tt_eager/tt_dnn/op_library/moreh_matmul/multi_core/moreh_matmul_op_multi_core.cpp index 46890e860cb..c8e1f0c12fb 100644 --- a/tt_eager/tt_dnn/op_library/moreh_matmul/multi_core/moreh_matmul_op_multi_core.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_matmul/multi_core/moreh_matmul_op_multi_core.cpp @@ -174,16 +174,16 @@ operation::ProgramWithCallbacks moreh_matmul_multi_core( //////////////////////////////////////////////////////////////////////////// // Core Grid Configuration For Workload //////////////////////////////////////////////////////////////////////////// - CoreGridDesc core_grid(device); - const auto num_cores_y {core_grid.y_}; - CoreCoord core_grid_coord = {core_grid.x_, num_cores_y}; + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; + const auto [num_cores, all_cores, core_group_1, core_group_2, num_output_tiles_per_core_group_1, - num_output_tiles_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, num_output_tiles); + num_output_tiles_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_output_tiles); log_debug(LogOp, "{}:{} num_output_tiles: {}", __func__, __LINE__, num_output_tiles); log_debug(LogOp, "{}:{} num_output_tiles_per_core_group1: {}, 2: {} ", __func__, __LINE__, num_output_tiles_per_core_group_1, num_output_tiles_per_core_group_2); diff --git a/tt_eager/tt_dnn/op_library/moreh_mean/moreh_mean_nc/moreh_mean_nc.cpp b/tt_eager/tt_dnn/op_library/moreh_mean/moreh_mean_nc/moreh_mean_nc.cpp index 9114b8a58ff..37882ee9255 100644 --- a/tt_eager/tt_dnn/op_library/moreh_mean/moreh_mean_nc/moreh_mean_nc.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_mean/moreh_mean_nc/moreh_mean_nc.cpp @@ -56,9 +56,8 @@ operation::ProgramWithCallbacks moreh_mean_nc(const Tensor &input, const Tensor //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// - CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord(core_grid.x_, num_cores_y); + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; const uint32_t in0_t = 2; // input const uint32_t in1_t = 1; // zero @@ -71,7 +70,7 @@ operation::ProgramWithCallbacks moreh_mean_nc(const Tensor &input, const Tensor core_group_1, core_group_2, num_cols_per_core_group_1, - num_cols_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, num_output_tiles); + num_cols_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_output_tiles); //////////////////////////////////////////////////////////////////////////// // CircularBuffer Setup diff --git a/tt_eager/tt_dnn/op_library/moreh_mean_backward/moreh_mean_backward.cpp b/tt_eager/tt_dnn/op_library/moreh_mean_backward/moreh_mean_backward.cpp index bb2e2b6a45d..fd230522221 100644 --- a/tt_eager/tt_dnn/op_library/moreh_mean_backward/moreh_mean_backward.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_mean_backward/moreh_mean_backward.cpp @@ -63,9 +63,8 @@ operation::ProgramWithCallbacks moreh_mean_backward_program(const Tensor &output //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// - CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord(core_grid.x_, num_cores_y); + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; const uint32_t in0_t = 2; // input const uint32_t in1_t = 1; // zero @@ -78,7 +77,7 @@ operation::ProgramWithCallbacks moreh_mean_backward_program(const Tensor &output core_group_1, core_group_2, num_cols_per_core_group_1, - num_cols_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, num_input_grad_tiles); + num_cols_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_input_grad_tiles); //////////////////////////////////////////////////////////////////////////// // CircularBuffer Setup diff --git a/tt_eager/tt_dnn/op_library/moreh_norm/moreh_norm_h/moreh_norm_h.cpp b/tt_eager/tt_dnn/op_library/moreh_norm/moreh_norm_h/moreh_norm_h.cpp index 51a17f0cef1..5dd7bc29576 100644 --- a/tt_eager/tt_dnn/op_library/moreh_norm/moreh_norm_h/moreh_norm_h.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_norm/moreh_norm_h/moreh_norm_h.cpp @@ -53,9 +53,8 @@ operation::ProgramWithCallbacks moreh_norm_h_impl(const Tensor &input, float p, //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// - tt_metal::CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord(core_grid.x_, num_cores_y); + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; const auto [num_cores_to_be_used, @@ -63,7 +62,7 @@ operation::ProgramWithCallbacks moreh_norm_h_impl(const Tensor &input, float p, core_group_1, core_group_2, num_cols_per_core_group_1, - num_cols_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, N * C * Wt); + num_cols_per_core_group_2] = tt_metal::split_work_to_cores(grid, N * C * Wt); //////////////////////////////////////////////////////////////////////////// // CircularBuffer Setup diff --git a/tt_eager/tt_dnn/op_library/moreh_norm/moreh_norm_other/moreh_norm_other.cpp b/tt_eager/tt_dnn/op_library/moreh_norm/moreh_norm_other/moreh_norm_other.cpp index 5d9b734fa80..9890250cc2d 100644 --- a/tt_eager/tt_dnn/op_library/moreh_norm/moreh_norm_other/moreh_norm_other.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_norm/moreh_norm_other/moreh_norm_other.cpp @@ -66,9 +66,8 @@ operation::ProgramWithCallbacks moreh_norm_other_impl(const Tensor &input, float //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// - tt_metal::CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord(core_grid.x_, num_cores_y); + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; const auto [num_cores_to_be_used, @@ -76,7 +75,7 @@ operation::ProgramWithCallbacks moreh_norm_other_impl(const Tensor &input, float core_group_1, core_group_2, num_output_tiles_per_core_group_1, - num_output_tiles_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, num_output_tiles); + num_output_tiles_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_output_tiles); //////////////////////////////////////////////////////////////////////////// // CircularBuffer Setup diff --git a/tt_eager/tt_dnn/op_library/moreh_norm/moreh_norm_w/moreh_norm_w.cpp b/tt_eager/tt_dnn/op_library/moreh_norm/moreh_norm_w/moreh_norm_w.cpp index c6c3e17ba98..869c05e2bb3 100644 --- a/tt_eager/tt_dnn/op_library/moreh_norm/moreh_norm_w/moreh_norm_w.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_norm/moreh_norm_w/moreh_norm_w.cpp @@ -53,9 +53,8 @@ operation::ProgramWithCallbacks moreh_norm_w_impl(const Tensor &input, float p, //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// - tt_metal::CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord(core_grid.x_, num_cores_y); + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; const auto [num_cores_to_be_used, @@ -63,7 +62,7 @@ operation::ProgramWithCallbacks moreh_norm_w_impl(const Tensor &input, float p, core_group_1, core_group_2, num_rows_per_core_group_1, - num_rows_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, N * C * Ht); + num_rows_per_core_group_2] = tt_metal::split_work_to_cores(grid, N * C * Ht); //////////////////////////////////////////////////////////////////////////// // CircularBuffer Setup diff --git a/tt_eager/tt_dnn/op_library/moreh_norm_backward/moreh_norm_backward.cpp b/tt_eager/tt_dnn/op_library/moreh_norm_backward/moreh_norm_backward.cpp index 33b9ebd164f..06316987348 100644 --- a/tt_eager/tt_dnn/op_library/moreh_norm_backward/moreh_norm_backward.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_norm_backward/moreh_norm_backward.cpp @@ -95,9 +95,8 @@ operation::ProgramWithCallbacks moreh_norm_backward_( //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// - tt_metal::CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord(core_grid.x_, num_cores_y); + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; const auto [num_cores_to_be_used, @@ -105,7 +104,7 @@ operation::ProgramWithCallbacks moreh_norm_backward_( core_group_1, core_group_2, num_input_tiles_per_core_group_1, - num_input_tiles_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, num_input_tiles); + num_input_tiles_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_input_tiles); //////////////////////////////////////////////////////////////////////////// // CircularBuffer Setup diff --git a/tt_eager/tt_dnn/op_library/moreh_sum/moreh_sum_nc_impl/moreh_sum_nc_impl.cpp b/tt_eager/tt_dnn/op_library/moreh_sum/moreh_sum_nc_impl/moreh_sum_nc_impl.cpp index 9d93b52388a..47e2eab19ef 100644 --- a/tt_eager/tt_dnn/op_library/moreh_sum/moreh_sum_nc_impl/moreh_sum_nc_impl.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_sum/moreh_sum_nc_impl/moreh_sum_nc_impl.cpp @@ -68,9 +68,8 @@ operation::ProgramWithCallbacks moreh_sum_nc_impl(const Tensor &input, const Ten //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// - CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord = {core_grid.x_, num_cores_y}; + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; const uint32_t in0_t = 2; // input const uint32_t in1_t = 1; // zero @@ -82,7 +81,7 @@ operation::ProgramWithCallbacks moreh_sum_nc_impl(const Tensor &input, const Ten core_group_1, core_group_2, num_cols_per_core_group_1, - num_cols_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, num_output_tiles); + num_cols_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_output_tiles); //////////////////////////////////////////////////////////////////////////// // CircularBuffer Setup diff --git a/tt_eager/tt_dnn/op_library/moreh_sum_backward/moreh_sum_backward_impl/moreh_sum_backward_impl.cpp b/tt_eager/tt_dnn/op_library/moreh_sum_backward/moreh_sum_backward_impl/moreh_sum_backward_impl.cpp index 352d5ab5972..933f5d15b0a 100644 --- a/tt_eager/tt_dnn/op_library/moreh_sum_backward/moreh_sum_backward_impl/moreh_sum_backward_impl.cpp +++ b/tt_eager/tt_dnn/op_library/moreh_sum_backward/moreh_sum_backward_impl/moreh_sum_backward_impl.cpp @@ -91,9 +91,8 @@ operation::ProgramWithCallbacks moreh_sum_backward_impl(const Tensor &output_gra //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// - CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord = {core_grid.x_, num_cores_y}; + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; const uint32_t in0_t = 2; // input const uint32_t in1_t = 1; // zero @@ -104,7 +103,7 @@ operation::ProgramWithCallbacks moreh_sum_backward_impl(const Tensor &output_gra core_group_1, core_group_2, num_cols_per_core_group_1, - num_cols_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, num_input_grad_tiles); + num_cols_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_input_grad_tiles); //////////////////////////////////////////////////////////////////////////// // CircularBuffer Setup diff --git a/tt_eager/tt_dnn/op_library/prod/prod_nc/prod_nc.cpp b/tt_eager/tt_dnn/op_library/prod/prod_nc/prod_nc.cpp index 88d499a8460..ba2c73a2590 100644 --- a/tt_eager/tt_dnn/op_library/prod/prod_nc/prod_nc.cpp +++ b/tt_eager/tt_dnn/op_library/prod/prod_nc/prod_nc.cpp @@ -55,9 +55,8 @@ operation::ProgramWithCallbacks prod_nc_format(const Tensor &input, const Tensor //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// - CoreGridDesc core_grid(device); - const auto num_cores_y = core_grid.y_; - CoreCoord core_grid_coord = {core_grid.x_, num_cores_y}; + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; const uint32_t in0_t = 2; // input const uint32_t in1_t = 1; // zero @@ -69,7 +68,7 @@ operation::ProgramWithCallbacks prod_nc_format(const Tensor &input, const Tensor core_group_1, core_group_2, num_cols_per_core_group_1, - num_cols_per_core_group_2] = tt_metal::split_work_to_cores(core_grid_coord, num_output_tiles); + num_cols_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_output_tiles); //////////////////////////////////////////////////////////////////////////// // CircularBuffer Setup diff --git a/tt_eager/tt_dnn/op_library/work_split.hpp b/tt_eager/tt_dnn/op_library/work_split.hpp index a4e05aefa25..7b2a330c864 100644 --- a/tt_eager/tt_dnn/op_library/work_split.hpp +++ b/tt_eager/tt_dnn/op_library/work_split.hpp @@ -11,59 +11,11 @@ #include "tt_metal/common/assert.hpp" #include "tt_metal/common/core_coord.h" #include "tt_metal/common/math.hpp" - #include "tt_metal/host_api.hpp" - namespace tt { namespace tt_metal { -// splits the tiles evenly between num_cores, -// with option of padding where necessary -struct TilesSplit { - int num_cores_; - int total_tiles_; - int tpc_; // unclipped tiles per core - - inline TilesSplit(int num_cores, int total_tiles) : num_cores_(num_cores), total_tiles_(total_tiles) { - tpc_ = div_up(total_tiles_, num_cores_); - } - - // number of tiles per core for div_up split - inline uint32_t get_tpc() const { return tpc_; } - - // number of tiles per core for close to even split with multiples of 8 going to each core - inline uint32_t get_clipped_tpc(int icore) const { - auto result = ( tpc_*(icore+1) > total_tiles_ ) ? ( total_tiles_ - tpc_*(icore+1) ) : tpc_; - return result; - } -}; - -struct CoreGridDesc { - uint32_t x_, y_; - CoreGridDesc(Device* dev) { auto gs = dev->compute_with_storage_grid_size(); x_ = gs.x; y_ = gs.y; TT_ASSERT(x_ > 0 && y_ > 0); } - uint32_t total_cores() const { return x_*y_; } - CoreCoord wrap_core(int icore) const { - TT_ASSERT(icore < total_cores()); - CoreCoord core = {(std::size_t) icore % x_, (std::size_t) icore / x_}; - return core; - } - - int numcores_dividing_numtiles(int num_tiles, int block_size = 1) const { - // since we will be splitting num_tiles into num_cores we need to find num_cores such that - // num_tiles % num_cores = 0, so that it's evenly divided since we don't support leftovers at the moment - // TODO(AP): optimize if needed, O(max_cores) atm - uint32_t max_cores = total_cores(); - TT_ASSERT(max_cores % block_size == 0 || max_cores == 1); - if (max_cores > num_tiles) - max_cores = num_tiles; - for (int j = max_cores; j >= 1; j--) - if (num_tiles % j == 0) - return j; - return 1; - } -}; - // Given a number of tiles and number of cores available // Set the largest number of cores less than the number of tiles // Returns the number of cores as well as the number of tiles per core @@ -99,7 +51,7 @@ inline int find_max_divisor(uint32_t val, uint32_t start_max_div) { return result; } -inline int find_max_block_size(uint32_t val, uint32_t max_block_size=8) { +inline int find_max_block_size(uint32_t val, uint32_t max_block_size = 8) { int result = 1; for (int find_divisor = max_block_size; find_divisor >= 1; find_divisor--) { if (val % find_divisor == 0) { @@ -110,8 +62,12 @@ inline int find_max_block_size(uint32_t val, uint32_t max_block_size=8) { return result; } -inline std::set num_cores_to_corerange_set(const CoreCoord start_core, const uint32_t target_num_cores, const CoreCoord grid_size, const bool row_wise = false) { - uint32_t num_cores_x = grid_size.x; +inline std::set num_cores_to_corerange_set( + const CoreCoord start_core, + const uint32_t target_num_cores, + const CoreCoord grid_size, + const bool row_wise = false) { + uint32_t num_cores_x = grid_size.x; uint32_t num_cores_y = grid_size.y; uint32_t total_available_cores = 0; TT_FATAL(start_core.x < num_cores_x && start_core.y < num_cores_y, "Start core must be within grid size"); @@ -126,8 +82,12 @@ inline std::set num_cores_to_corerange_set(const CoreCoord start_core // Partial Cols total_available_cores += num_cores_y - start_core.y; } - TT_FATAL(target_num_cores <= total_available_cores, "Target number of cores {} is greater than total number of available cores {}", target_num_cores, total_available_cores); - std::set all_cores_set; + TT_FATAL( + target_num_cores <= total_available_cores, + "Target number of cores {} is greater than total number of available cores {}", + target_num_cores, + total_available_cores); + std::set all_cores_set; uint32_t leftover_size = target_num_cores; CoreCoord s_core = start_core; if (row_wise) { @@ -173,17 +133,19 @@ inline std::set num_cores_to_corerange_set(const CoreCoord start_core all_cores_set.insert(leftover_block); } } - return all_cores_set; + return all_cores_set; } // TODO: Get rid of old function -inline std::set num_cores_to_corerange_set(const uint32_t target_num_cores, const CoreCoord grid_size, const bool row_wise = false) { - return num_cores_to_corerange_set({0, 0}, target_num_cores, grid_size, row_wise); +inline std::set num_cores_to_corerange_set( + const uint32_t target_num_cores, const CoreCoord grid_size, const bool row_wise = false) { + return num_cores_to_corerange_set({0, 0}, target_num_cores, grid_size, row_wise); } // TODO: Switch num_cores_to_corerange_set to always return CoreRangeSet -inline CoreRangeSet num_cores_to_core_range_set(const uint32_t target_num_cores, const CoreCoord grid_size, const bool row_wise = false) { - return CoreRangeSet(num_cores_to_corerange_set({0, 0}, target_num_cores, grid_size, row_wise)); +inline CoreRangeSet num_cores_to_core_range_set( + const uint32_t target_num_cores, const CoreCoord grid_size, const bool row_wise = false) { + return CoreRangeSet(num_cores_to_corerange_set({0, 0}, target_num_cores, grid_size, row_wise)); } // This function takes in the core grid size, as well as the number of units of work to divide between the cores @@ -191,84 +153,76 @@ inline CoreRangeSet num_cores_to_core_range_set(const uint32_t target_num_cores, // the greater amount of work, and the CoreRangeSet that does less work if work cannot be evenly divided // If it can be evenly divided, the second CoreRangeSet is the same as the first, and the last is empty // The last 2 args are the units of work for the two core grids -inline std::tuple split_work_to_cores(const CoreCoord grid_size, const uint32_t units_to_divide, const bool row_wise = false) { +inline std::tuple split_work_to_cores( + const CoreCoord grid_size, const uint32_t units_to_divide, const bool row_wise = false) { ZoneScoped; - uint32_t num_cores_x = grid_size.x, num_cores_y = grid_size.y; - auto target_num_cores = std::min(units_to_divide, num_cores_x * num_cores_y); - CoreRangeSet all_cores(num_cores_to_corerange_set(target_num_cores, grid_size, row_wise)); - - std::set core_group_1_set; - std::set core_group_2_set; - uint32_t units_per_core_group_1 = units_to_divide / target_num_cores; - uint32_t units_per_core_group_2 = 0; + uint32_t num_cores_x = grid_size.x, num_cores_y = grid_size.y; + auto target_num_cores = std::min(units_to_divide, num_cores_x * num_cores_y); + CoreRangeSet all_cores(num_cores_to_corerange_set(target_num_cores, grid_size, row_wise)); + + std::set core_group_1_set; + std::set core_group_2_set; + uint32_t units_per_core_group_1 = units_to_divide / target_num_cores; + uint32_t units_per_core_group_2 = 0; // Evenly divided units to all target cores - if (units_to_divide % target_num_cores == 0) { - core_group_1_set = all_cores.ranges(); - // Uneven division of units across cores - // This case should only be hit when there are more units of work than a full grid of cores - // which is implicitly assumed in the following logic - } else { + if (units_to_divide % target_num_cores == 0) { + core_group_1_set = all_cores.ranges(); + // Uneven division of units across cores + // This case should only be hit when there are more units of work than a full grid of cores + // which is implicitly assumed in the following logic + } else { // Group of cores that do more work core_group_1_set = num_cores_to_corerange_set(units_to_divide % target_num_cores, grid_size, row_wise); auto last_block_group_1 = (*core_group_1_set.rbegin()); auto last_block_all_cores = (*all_cores.ranges().rbegin()); if (row_wise) { // Case where only the last row is divided between core group 1 and 2 - if (last_block_group_1.end.y == last_block_all_cores.end.y && last_block_group_1.end.x != last_block_all_cores.end.x) { + if (last_block_group_1.end.y == last_block_all_cores.end.y && + last_block_group_1.end.x != last_block_all_cores.end.x) { CoreRange leftover_block( - {last_block_group_1.end.x + 1, last_block_group_1.end.y}, - last_block_all_cores.end - ); + {last_block_group_1.end.x + 1, last_block_group_1.end.y}, last_block_all_cores.end); core_group_2_set.insert(leftover_block); } else { // Case where a middle row is divided between core group 1 and 2 if (last_block_group_1.end.x != num_cores_x - 1) { CoreRange leftover_stick( {last_block_group_1.end.x + 1, last_block_group_1.end.y}, - {num_cores_x - 1, last_block_group_1.end.y} - ); + {num_cores_x - 1, last_block_group_1.end.y}); core_group_2_set.insert(leftover_stick); } // Remaining rows of cores that does less work - CoreRange leftover_block( - {0, last_block_group_1.end.y + 1}, - last_block_all_cores.end - ); + CoreRange leftover_block({0, last_block_group_1.end.y + 1}, last_block_all_cores.end); core_group_2_set.insert(leftover_block); } } else { // Case where only the last column is divided between core group 1 and 2 - if (last_block_group_1.end.x == last_block_all_cores.end.x && last_block_group_1.end.y != last_block_all_cores.end.y) { + if (last_block_group_1.end.x == last_block_all_cores.end.x && + last_block_group_1.end.y != last_block_all_cores.end.y) { CoreRange leftover_block( - {last_block_group_1.end.x, last_block_group_1.end.y + 1}, - last_block_all_cores.end - ); + {last_block_group_1.end.x, last_block_group_1.end.y + 1}, last_block_all_cores.end); core_group_2_set.insert(leftover_block); } else { // Case where a middle column is divided between core group 1 and 2 if (last_block_group_1.end.y != num_cores_y - 1) { CoreRange leftover_stick( {last_block_group_1.end.x, last_block_group_1.end.y + 1}, - {last_block_group_1.end.x, num_cores_y - 1} - ); + {last_block_group_1.end.x, num_cores_y - 1}); core_group_2_set.insert(leftover_stick); } // Remaining columns of cores that does less work - CoreRange leftover_block( - {last_block_group_1.end.x + 1, 0}, - last_block_all_cores.end - ); + CoreRange leftover_block({last_block_group_1.end.x + 1, 0}, last_block_all_cores.end); core_group_2_set.insert(leftover_block); } } - units_per_core_group_2 = units_per_core_group_1; + units_per_core_group_2 = units_per_core_group_1; units_per_core_group_1++; - } - CoreRangeSet core_group_1(core_group_1_set); - CoreRangeSet core_group_2(core_group_2_set); + } + CoreRangeSet core_group_1(core_group_1_set); + CoreRangeSet core_group_2(core_group_2_set); - return std::make_tuple(target_num_cores, all_cores, core_group_1, core_group_2, units_per_core_group_1, units_per_core_group_2); + return std::make_tuple( + target_num_cores, all_cores, core_group_1, core_group_2, units_per_core_group_1, units_per_core_group_2); } -} // namespace tt_metal -} // namespace tt +} // namespace tt_metal +} // namespace tt diff --git a/tt_metal/programming_examples/matmul_common/work_split.hpp b/tt_metal/programming_examples/matmul_common/work_split.hpp index 23a76c4861e..b3e458e9c5a 100644 --- a/tt_metal/programming_examples/matmul_common/work_split.hpp +++ b/tt_metal/programming_examples/matmul_common/work_split.hpp @@ -17,52 +17,6 @@ namespace tt { namespace tt_metal { -// splits the tiles evenly between num_cores, -// with option of padding where necessary -struct TilesSplit { - int num_cores_; - int total_tiles_; - int tpc_; // unclipped tiles per core - - inline TilesSplit(int num_cores, int total_tiles) : num_cores_(num_cores), total_tiles_(total_tiles) { - tpc_ = div_up(total_tiles_, num_cores_); - } - - // number of tiles per core for div_up split - inline uint32_t get_tpc() const { return tpc_; } - - // number of tiles per core for close to even split with multiples of 8 going to each core - inline uint32_t get_clipped_tpc(int icore) const { - auto result = ( tpc_*(icore+1) > total_tiles_ ) ? ( total_tiles_ - tpc_*(icore+1) ) : tpc_; - return result; - } -}; - -struct CoreGridDesc { - uint32_t x_, y_; - CoreGridDesc(Device* dev) { auto gs = dev->compute_with_storage_grid_size(); x_ = gs.x; y_ = gs.y; TT_ASSERT(x_ > 0 && y_ > 0); } - uint32_t total_cores() const { return x_*y_; } - CoreCoord wrap_core(int icore) const { - TT_ASSERT(icore < total_cores()); - CoreCoord core = {(std::size_t) icore % x_, (std::size_t) icore / x_}; - return core; - } - - int numcores_dividing_numtiles(int num_tiles, int block_size = 1) const { - // since we will be splitting num_tiles into num_cores we need to find num_cores such that - // num_tiles % num_cores = 0, so that it's evenly divided since we don't support leftovers at the moment - // TODO(AP): optimize if needed, O(max_cores) atm - uint32_t max_cores = total_cores(); - TT_ASSERT(max_cores % block_size == 0 || max_cores == 1); - if (max_cores > num_tiles) - max_cores = num_tiles; - for (int j = max_cores; j >= 1; j--) - if (num_tiles % j == 0) - return j; - return 1; - } -}; - // Given a number of tiles and number of cores available // Set the largest number of cores less than the number of tiles // Returns the number of cores as well as the number of tiles per core