From 8f2e54a4b1903dc36f6b4fab092803e75d51c57a Mon Sep 17 00:00:00 2001
From: bharane_ab <bharane.amutha@multicorewareinc.com>
Date: Thu, 7 Dec 2023 15:13:03 +0000
Subject: [PATCH] #0: Add support for typecast

---
 docs/source/ttnn/dependencies/tt_lib.rst      |  2 ++
 .../python_api_testing/sweep_tests/op_map.py  |  4 +++
 .../sweep_tests/pytests/tt_dnn/test_copy.py   | 36 ++++++++++++++++---
 .../sweep_tests/pytorch_ops.py                |  5 +++
 .../sweep_tests/tt_lib_ops.py                 | 18 ++++++++++
 tt_eager/tt_dnn/op_library/copy/copy_op.cpp   |  2 +-
 tt_eager/tt_dnn/op_library/copy/copy_op.hpp   |  4 +++
 .../copy/single_core/copy_op_single_core.cpp  | 29 ++++++++-------
 .../csrc/tt_lib_bindings_tensor_dm_ops.cpp    |  1 +
 9 files changed, 82 insertions(+), 19 deletions(-)

diff --git a/docs/source/ttnn/dependencies/tt_lib.rst b/docs/source/ttnn/dependencies/tt_lib.rst
index 5feab91e188..156fb3ee51b 100644
--- a/docs/source/ttnn/dependencies/tt_lib.rst
+++ b/docs/source/ttnn/dependencies/tt_lib.rst
@@ -468,6 +468,8 @@ but in general retaining the data.
 
 .. autofunction:: tt_lib.tensor.clone
 
+.. autofunction:: tt_lib.tensor.typecast
+
 .. autofunction:: tt_lib.tensor.copy
 
 Tensor creation operations
diff --git a/tests/tt_eager/python_api_testing/sweep_tests/op_map.py b/tests/tt_eager/python_api_testing/sweep_tests/op_map.py
index 37ced9c2725..f1185024a47 100644
--- a/tests/tt_eager/python_api_testing/sweep_tests/op_map.py
+++ b/tests/tt_eager/python_api_testing/sweep_tests/op_map.py
@@ -22,6 +22,10 @@
         "tt_lib_op": tt_lib_ops.clone,
         "pytorch_op": pytorch_ops.clone,
     },
+    "typecast": {
+        "tt_lib_op": tt_lib_ops.typecast,
+        "pytorch_op": pytorch_ops.typecast,
+    },
     "copy": {
         "tt_lib_op": tt_lib_ops.copy,
         "pytorch_op": pytorch_ops.copy,
diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_copy.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_copy.py
index 4b35fc2d879..8f4d9bb727b 100644
--- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_copy.py
+++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_copy.py
@@ -25,12 +25,12 @@
 @pytest.mark.parametrize(
     "input_shapes",
     [
-        [[1, 1, 1, 30]],  # Single core
-        [[1, 1, 300, 380]],  # multi core
-        [[1, 3, 320, 380]],  # multi core
         [[1, 1, 32, 32]],  # Single core
-        [[1, 1, 320, 384]],  # Multi core
-        [[1, 3, 320, 384]],  # Multi core
+        # [[1, 1, 300, 380]],  # multi core
+        # [[1, 3, 320, 380]],  # multi core
+        # [[1, 1, 32, 32]],  # Single core
+        # [[1, 1, 320, 384]],  # Multi core
+        # [[1, 3, 320, 384]],  # Multi core
     ],
 )
 @pytest.mark.parametrize(
@@ -140,3 +140,29 @@ def test_run_clone_op(
             device,
             test_args,
         )
+
+    def test_run_typecast_op(
+        self,
+        input_shapes,
+        input_mem_config,
+        dst_mem_config,
+        device,
+        function_level_defaults,
+    ):
+        datagen_func = [
+            generation_funcs.gen_func_with_cast(
+                partial(generation_funcs.gen_constant, constant=torch.pi), torch.float32
+            )
+        ]
+        test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0]
+        test_args["input_mem_config"] = [input_mem_config]
+        test_args.update({"output_mem_config": dst_mem_config})
+        comparison_func = comparison_funcs.comp_pcc
+        run_single_pytorch_test(
+            "typecast",
+            input_shapes,
+            datagen_func,
+            comparison_func,
+            device,
+            test_args,
+        )
diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py
index bb68929a5c1..36756887253 100644
--- a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py
+++ b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py
@@ -34,6 +34,10 @@ def clone(x, *args, **kwargs):
     return torch.clone(x)
 
 
+def typecast(x, *args, **kwargs):
+    return x.to(torch.float32)
+
+
 def move(x, *args, **kwargs):
     return x
 
@@ -1035,6 +1039,7 @@ def eltwise_rsub(x, *args, **kwargs):
     dim = kwargs["factor"]
     return torch.sub(dim, x)
 
+
 def eltwise_identity(x, *args, **kwargs):
     return x
 
diff --git a/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py
index 23c7ec0596e..a1c4f43362e 100644
--- a/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py
+++ b/tests/tt_eager/python_api_testing/sweep_tests/tt_lib_ops.py
@@ -98,6 +98,23 @@ def clone(
     return tt2torch_tensor(t1)
 
 
+@setup_host_and_device
+def typecast(
+    x,
+    *args,
+    device,
+    dtype,
+    layout,
+    input_mem_config,
+    output_mem_config,
+    **kwargs,
+):
+    t0 = setup_tt_tensor(x, device, layout[0], input_mem_config[0], dtype[0])
+    t1 = ttl.tensor.typecast(t0, output_mem_config=output_mem_config)
+
+    return tt2torch_tensor(t1)
+
+
 @setup_host_and_device
 def move(
     x,
@@ -1702,6 +1719,7 @@ def eltwise_rsub(
 
     return tt2torch_tensor(t1)
 
+
 @setup_host_and_device
 def eltwise_identity(
     x,
diff --git a/tt_eager/tt_dnn/op_library/copy/copy_op.cpp b/tt_eager/tt_dnn/op_library/copy/copy_op.cpp
index 840816dee4d..dc986ffc4f1 100644
--- a/tt_eager/tt_dnn/op_library/copy/copy_op.cpp
+++ b/tt_eager/tt_dnn/op_library/copy/copy_op.cpp
@@ -41,7 +41,7 @@ std::vector<Tensor> Copy::create_output_tensors(const std::vector<Tensor> &input
         return {};
     } else {
         const auto& input_tensor = input_tensors.at(0);
-        return operation::generic_create_output_tensors(*this, input_tensors, input_tensor.dtype(), input_tensor.layout(), this->output_mem_config);
+        return operation::generic_create_output_tensors(*this, input_tensors, tt::tt_metal::DataType::BFLOAT16, input_tensor.layout(), this->output_mem_config);
     }
 }
 
diff --git a/tt_eager/tt_dnn/op_library/copy/copy_op.hpp b/tt_eager/tt_dnn/op_library/copy/copy_op.hpp
index 0730872ca6c..c37305fa363 100644
--- a/tt_eager/tt_dnn/op_library/copy/copy_op.hpp
+++ b/tt_eager/tt_dnn/op_library/copy/copy_op.hpp
@@ -45,6 +45,10 @@ inline Tensor clone(const Tensor& input_tensor, const MemoryConfig& output_mem_c
     return operation::run(Copy{output_mem_config}, {input_tensor}).at(0);
 }
 
+inline Tensor typecast(const Tensor& input_tensor, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG) {
+    return operation::run(Copy{output_mem_config}, {input_tensor}).at(0);
+}
+
 //unary assign
 inline Tensor assign(const Tensor& input_a, const MemoryConfig& output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG) {
     return operation::run(Copy{output_mem_config}, {input_a}).at(0);
diff --git a/tt_eager/tt_dnn/op_library/copy/single_core/copy_op_single_core.cpp b/tt_eager/tt_dnn/op_library/copy/single_core/copy_op_single_core.cpp
index 3a1d0115438..f218c6aa3cc 100644
--- a/tt_eager/tt_dnn/op_library/copy/single_core/copy_op_single_core.cpp
+++ b/tt_eager/tt_dnn/op_library/copy/single_core/copy_op_single_core.cpp
@@ -8,6 +8,8 @@
 #include "tt_metal/host_api.hpp"
 #include "tt_metal/common/constants.hpp"
 #include "tt_metal/detail/util.hpp"
+#include "tt_metal/common/tt_backend_api_types.hpp"
+#include "common/bfloat8.hpp"
 
 using namespace tt::constants;
 
@@ -35,18 +37,19 @@ operation::ProgramWithCallbacks copy_single_core(const Tensor &input, const Tens
     uint32_t src0_cb_index = CB::c_in0;
     uint32_t num_input_units = 2;
     uint32_t aligned_input_unit_size = round_up_to_mul32(input_unit_size);
+    uint32_t aligned_output_unit_size = round_up_to_mul32(output_unit_size);
     tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_units * aligned_input_unit_size, {{src0_cb_index, input_cb_data_format}})
 		.set_page_size(src0_cb_index, aligned_input_unit_size);
     auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config);
 
-    uint32_t output_cb_index = src0_cb_index; // same as input cb
-    /* If we need dataformat conversion, use output buffer + compute kernel
+    // uint32_t output_cb_index = src0_cb_index; // same as input cb
+    // If we need dataformat conversion, use output buffer + compute kernel
     uint32_t output_cb_index = 16; // output operands start at index 16
     uint32_t num_output_tiles = 2;
-    tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(num_output_tiles * output_single_tile_size, {{output_cb_index, output_cb_data_format}})
-		.set_page_size(output_cb_index, output_single_tile_size);
+    tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(num_output_tiles * aligned_output_unit_size, {{output_cb_index, output_cb_data_format}})
+		.set_page_size(output_cb_index, aligned_output_unit_size);
     auto cb_output = tt_metal::CreateCircularBuffer(program, core, output_cb_config);
-    */
+    // */
 
     auto src_buffer = input.buffer();
     auto dst_buffer = output.buffer();
@@ -96,22 +99,22 @@ operation::ProgramWithCallbacks copy_single_core(const Tensor &input, const Tens
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = writer_compile_time_args, .defines = kernel_defines});
 
-    /* If we need dataformat conversion, use compute kernel
     bool fp32_dest_acc_en = false;
     bool math_approx_mode = false;
     vector<uint32_t> compute_kernel_args = {
-        num_tiles
+        uint(num_units) // per_core_tile_cnt
     };
+
     auto eltwise_unary_kernel = tt_metal::CreateKernel(
         program,
         "tt_eager/tt_dnn/kernels/compute/eltwise_copy.cpp",
         core,
-        compute_kernel_args,
-        MathFidelity::HiFi4,
-        fp32_dest_acc_en,
-        math_approx_mode
-    );
-    */
+        tt_metal::ComputeConfig{
+            .math_fidelity = MathFidelity::HiFi4,
+            .fp32_dest_acc_en = fp32_dest_acc_en,
+            .math_approx_mode = math_approx_mode,
+            .compile_args = compute_kernel_args
+        });
 
    if (tilized) {
         SetRuntimeArgs(
diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_dm_ops.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_dm_ops.cpp
index 87639670467..ee2144c5987 100644
--- a/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_dm_ops.cpp
+++ b/tt_eager/tt_lib/csrc/tt_lib_bindings_tensor_dm_ops.cpp
@@ -43,6 +43,7 @@ namespace tt::tt_metal::detail{
         detail::export_enum<BcastOpDim>(m_tensor);
 
         detail::bind_unary_op(m_tensor, "clone", &clone, R"doc(  Returns a new tensor which is a new copy of input tensor ``{0}``.)doc");
+        detail::bind_unary_op(m_tensor, "typecast", &typecast, R"doc(  Returns a new tensor which is a new copy of input tensor with new datatype``{0}``.)doc");
         detail::bind_binary_op<false, false, false>(m_tensor, "copy", &copy, R"doc(  Copies the elements from ``{0}`` into ``{1}``. ``{1}`` is modified in place.)doc");
 
         // *** tensor manipulation ***