From 8f2e54a4b1903dc36f6b4fab092803e75d51c57a Mon Sep 17 00:00:00 2001 From: bharane_ab Date: Thu, 7 Dec 2023 15:13:03 +0000 Subject: [PATCH] #0: Add support for typecast --- docs/source/ttnn/dependencies/tt_lib.rst | 2 ++ .../python_api_testing/sweep_tests/op_map.py | 4 +++ .../sweep_tests/pytests/tt_dnn/test_copy.py | 36 ++++++++++++++++--- .../sweep_tests/pytorch_ops.py | 5 +++ .../sweep_tests/tt_lib_ops.py | 18 ++++++++++ tt_eager/tt_dnn/op_library/copy/copy_op.cpp | 2 +- tt_eager/tt_dnn/op_library/copy/copy_op.hpp | 4 +++ .../copy/single_core/copy_op_single_core.cpp | 29 ++++++++------- .../csrc/tt_lib_bindings_tensor_dm_ops.cpp | 1 + 9 files changed, 82 insertions(+), 19 deletions(-) diff --git a/docs/source/ttnn/dependencies/tt_lib.rst b/docs/source/ttnn/dependencies/tt_lib.rst index 5feab91e188..156fb3ee51b 100644 --- a/docs/source/ttnn/dependencies/tt_lib.rst +++ b/docs/source/ttnn/dependencies/tt_lib.rst @@ -468,6 +468,8 @@ but in general retaining the data. .. autofunction:: tt_lib.tensor.clone +.. autofunction:: tt_lib.tensor.typecast + .. autofunction:: tt_lib.tensor.copy Tensor creation operations diff --git a/tests/tt_eager/python_api_testing/sweep_tests/op_map.py b/tests/tt_eager/python_api_testing/sweep_tests/op_map.py index 37ced9c2725..f1185024a47 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/op_map.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/op_map.py @@ -22,6 +22,10 @@ "tt_lib_op": tt_lib_ops.clone, "pytorch_op": pytorch_ops.clone, }, + "typecast": { + "tt_lib_op": tt_lib_ops.typecast, + "pytorch_op": pytorch_ops.typecast, + }, "copy": { "tt_lib_op": tt_lib_ops.copy, "pytorch_op": pytorch_ops.copy, diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_copy.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_copy.py index 4b35fc2d879..8f4d9bb727b 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_copy.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_copy.py @@ -25,12 +25,12 @@ @pytest.mark.parametrize( "input_shapes", [ - [[1, 1, 1, 30]], # Single core - [[1, 1, 300, 380]], # multi core - [[1, 3, 320, 380]], # multi core [[1, 1, 32, 32]], # Single core - [[1, 1, 320, 384]], # Multi core - [[1, 3, 320, 384]], # Multi core + # [[1, 1, 300, 380]], # multi core + # [[1, 3, 320, 380]], # multi core + # [[1, 1, 32, 32]], # Single core + # [[1, 1, 320, 384]], # Multi core + # [[1, 3, 320, 384]], # Multi core ], ) @pytest.mark.parametrize( @@ -140,3 +140,29 @@ def test_run_clone_op( device, test_args, ) + + def test_run_typecast_op( + self, + input_shapes, + input_mem_config, + dst_mem_config, + device, + function_level_defaults, + ): + datagen_func = [ + generation_funcs.gen_func_with_cast( + partial(generation_funcs.gen_constant, constant=torch.pi), torch.float32 + ) + ] + test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0] + test_args["input_mem_config"] = [input_mem_config] + test_args.update({"output_mem_config": dst_mem_config}) + comparison_func = comparison_funcs.comp_pcc + run_single_pytorch_test( + "typecast", + input_shapes, + datagen_func, + comparison_func, + device, + test_args, + ) diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py index bb68929a5c1..36756887253 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py @@ -34,6 +34,10 @@ def clone(x, *args, **kwargs): return torch.clone(x) +def typecast(x, *args, **kwargs): + return x.to(torch.float32) + + def move(x, *args, **kwargs): return x @@ -1035,6 +1039,7 @@ def eltwise_rsub(x, *args, **kwargs): dim = kwargs["factor"] return torch.sub(dim, x) + def eltwise_identity(x, *args, **kwargs): return x diff --git a/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py index 23c7ec0596e..a1c4f43362e 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py @@ -98,6 +98,23 @@ def clone( return tt2torch_tensor(t1) +@setup_host_and_device +def typecast( + x, + *args, + device, + dtype, + layout, + input_mem_config, + output_mem_config, + **kwargs, +): + t0 = setup_tt_tensor(x, device, layout[0], input_mem_config[0], dtype[0]) + t1 = ttl.tensor.typecast(t0, output_mem_config=output_mem_config) + + return tt2torch_tensor(t1) + + @setup_host_and_device def move( x, @@ -1702,6 +1719,7 @@ def eltwise_rsub( return tt2torch_tensor(t1) + @setup_host_and_device def eltwise_identity( x, diff --git a/tt_eager/tt_dnn/op_library/copy/copy_op.cpp b/tt_eager/tt_dnn/op_library/copy/copy_op.cpp index 840816dee4d..dc986ffc4f1 100644 --- a/tt_eager/tt_dnn/op_library/copy/copy_op.cpp +++ b/tt_eager/tt_dnn/op_library/copy/copy_op.cpp @@ -41,7 +41,7 @@ std::vector Copy::create_output_tensors(const std::vector &input return {}; } else { const auto& input_tensor = input_tensors.at(0); - return operation::generic_create_output_tensors(*this, input_tensors, input_tensor.dtype(), input_tensor.layout(), this->output_mem_config); + return operation::generic_create_output_tensors(*this, input_tensors, tt::tt_metal::DataType::BFLOAT16, input_tensor.layout(), this->output_mem_config); } } diff --git a/tt_eager/tt_dnn/op_library/copy/copy_op.hpp b/tt_eager/tt_dnn/op_library/copy/copy_op.hpp index 0730872ca6c..c37305fa363 100644 --- a/tt_eager/tt_dnn/op_library/copy/copy_op.hpp +++ b/tt_eager/tt_dnn/op_library/copy/copy_op.hpp @@ -45,6 +45,10 @@ inline Tensor clone(const Tensor& input_tensor, const MemoryConfig& output_mem_c return operation::run(Copy{output_mem_config}, {input_tensor}).at(0); } +inline Tensor typecast(const Tensor& input_tensor, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG) { + return operation::run(Copy{output_mem_config}, {input_tensor}).at(0); +} + //unary assign inline Tensor assign(const Tensor& input_a, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG) { return operation::run(Copy{output_mem_config}, {input_a}).at(0); diff --git a/tt_eager/tt_dnn/op_library/copy/single_core/copy_op_single_core.cpp b/tt_eager/tt_dnn/op_library/copy/single_core/copy_op_single_core.cpp index 3a1d0115438..f218c6aa3cc 100644 --- a/tt_eager/tt_dnn/op_library/copy/single_core/copy_op_single_core.cpp +++ b/tt_eager/tt_dnn/op_library/copy/single_core/copy_op_single_core.cpp @@ -8,6 +8,8 @@ #include "tt_metal/host_api.hpp" #include "tt_metal/common/constants.hpp" #include "tt_metal/detail/util.hpp" +#include "tt_metal/common/tt_backend_api_types.hpp" +#include "common/bfloat8.hpp" using namespace tt::constants; @@ -35,18 +37,19 @@ operation::ProgramWithCallbacks copy_single_core(const Tensor &input, const Tens uint32_t src0_cb_index = CB::c_in0; uint32_t num_input_units = 2; uint32_t aligned_input_unit_size = round_up_to_mul32(input_unit_size); + uint32_t aligned_output_unit_size = round_up_to_mul32(output_unit_size); tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_units * aligned_input_unit_size, {{src0_cb_index, input_cb_data_format}}) .set_page_size(src0_cb_index, aligned_input_unit_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t output_cb_index = src0_cb_index; // same as input cb - /* If we need dataformat conversion, use output buffer + compute kernel + // uint32_t output_cb_index = src0_cb_index; // same as input cb + // If we need dataformat conversion, use output buffer + compute kernel uint32_t output_cb_index = 16; // output operands start at index 16 uint32_t num_output_tiles = 2; - tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(num_output_tiles * output_single_tile_size, {{output_cb_index, output_cb_data_format}}) - .set_page_size(output_cb_index, output_single_tile_size); + tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(num_output_tiles * aligned_output_unit_size, {{output_cb_index, output_cb_data_format}}) + .set_page_size(output_cb_index, aligned_output_unit_size); auto cb_output = tt_metal::CreateCircularBuffer(program, core, output_cb_config); - */ + // */ auto src_buffer = input.buffer(); auto dst_buffer = output.buffer(); @@ -96,22 +99,22 @@ operation::ProgramWithCallbacks copy_single_core(const Tensor &input, const Tens core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = writer_compile_time_args, .defines = kernel_defines}); - /* If we need dataformat conversion, use compute kernel bool fp32_dest_acc_en = false; bool math_approx_mode = false; vector compute_kernel_args = { - num_tiles + uint(num_units) // per_core_tile_cnt }; + auto eltwise_unary_kernel = tt_metal::CreateKernel( program, "tt_eager/tt_dnn/kernels/compute/eltwise_copy.cpp", core, - compute_kernel_args, - MathFidelity::HiFi4, - fp32_dest_acc_en, - math_approx_mode - ); - */ + tt_metal::ComputeConfig{ + .math_fidelity = MathFidelity::HiFi4, + .fp32_dest_acc_en = fp32_dest_acc_en, + .math_approx_mode = math_approx_mode, + .compile_args = compute_kernel_args + }); if (tilized) { SetRuntimeArgs( diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_dm_ops.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_dm_ops.cpp index 87639670467..ee2144c5987 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_dm_ops.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_dm_ops.cpp @@ -43,6 +43,7 @@ namespace tt::tt_metal::detail{ detail::export_enum(m_tensor); detail::bind_unary_op(m_tensor, "clone", &clone, R"doc( Returns a new tensor which is a new copy of input tensor ``{0}``.)doc"); + detail::bind_unary_op(m_tensor, "typecast", &typecast, R"doc( Returns a new tensor which is a new copy of input tensor with new datatype``{0}``.)doc"); detail::bind_binary_op(m_tensor, "copy", ©, R"doc( Copies the elements from ``{0}`` into ``{1}``. ``{1}`` is modified in place.)doc"); // *** tensor manipulation ***