diff --git a/.github/workflows/pipeline-select-t3k.yaml b/.github/workflows/pipeline-select-t3k.yaml index 3df726bdd6e..d602c81a73e 100644 --- a/.github/workflows/pipeline-select-t3k.yaml +++ b/.github/workflows/pipeline-select-t3k.yaml @@ -12,6 +12,10 @@ on: - RelWithDebInfo - CI default: "Release" + extra-tag: + required: true + type: string + default: "in-service" build-with-tracy: required: false type: boolean @@ -54,29 +58,41 @@ jobs: needs: build-artifact secrets: inherit uses: ./.github/workflows/t3000-unit-tests-impl.yaml + with: + extra-tag: ${{ inputs.extra-tag }} if: ${{ inputs.t3000-unit }} t3000-demo-tests: needs: build-artifact secrets: inherit uses: ./.github/workflows/t3000-demo-tests-impl.yaml + with: + extra-tag: ${{ inputs.extra-tag }} if: ${{ inputs.t3000-demo }} t3000-frequent-tests: needs: build-artifact secrets: inherit uses: ./.github/workflows/t3000-frequent-tests-impl.yaml + with: + extra-tag: ${{ inputs.extra-tag }} if: ${{ inputs.t3000-frequent }} t3000-nightly-tests: needs: build-artifact secrets: inherit uses: ./.github/workflows/t3000-nightly-tests-impl.yaml + with: + extra-tag: ${{ inputs.extra-tag }} if: ${{ inputs.t3000-nightly }} t3000-model-perf-tests: needs: build-artifact secrets: inherit uses: ./.github/workflows/t3000-model-perf-tests-impl.yaml + with: + extra-tag: ${{ inputs.extra-tag }} if: ${{ inputs.t3000-model-perf }} t3000-profiler-tests: needs: build-artifact secrets: inherit uses: ./.github/workflows/t3000-profiler-tests-impl.yaml + with: + extra-tag: ${{ inputs.extra-tag }} if: ${{ inputs.t3000-profiler }} diff --git a/.github/workflows/t3000-demo-tests-impl.yaml b/.github/workflows/t3000-demo-tests-impl.yaml index 8c3d98815a3..48f0467d2cb 100644 --- a/.github/workflows/t3000-demo-tests-impl.yaml +++ b/.github/workflows/t3000-demo-tests-impl.yaml @@ -2,6 +2,11 @@ name: "[internal] T3000 demo tests impl" on: workflow_call: + inputs: + extra-tag: + required: false + type: string + default: "in-service" jobs: t3000-demo-tests: @@ -23,7 +28,11 @@ jobs: ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-perf"] + runs-on: + - arch-wormhole_b0 + - config-t3000 + - pipeline-perf + - ${{ inputs.extra-tag }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - name: Enable performance mode diff --git a/.github/workflows/t3000-frequent-tests-impl.yaml b/.github/workflows/t3000-frequent-tests-impl.yaml index f770b1db457..67036141b0e 100644 --- a/.github/workflows/t3000-frequent-tests-impl.yaml +++ b/.github/workflows/t3000-frequent-tests-impl.yaml @@ -2,6 +2,11 @@ name: "[internal] T3000 frequent tests impl" on: workflow_call: + inputs: + extra-tag: + required: false + type: string + default: "in-service" jobs: t3000-frequent-tests: @@ -27,7 +32,11 @@ jobs: ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-functional"] + runs-on: + - arch-wormhole_b0 + - config-t3000 + - pipeline-functional + - ${{ inputs.extra-tag }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - uses: ./.github/actions/ensure-active-weka-mount diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml index 1787bd5c2e9..91e208c214b 100644 --- 
a/.github/workflows/t3000-model-perf-tests-impl.yaml +++ b/.github/workflows/t3000-model-perf-tests-impl.yaml @@ -2,6 +2,11 @@ name: "[internal] T3000 model perf tests impl" on: workflow_call: + inputs: + extra-tag: + required: false + type: string + default: "in-service" jobs: @@ -25,7 +30,11 @@ jobs: ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-perf"] + runs-on: + - arch-wormhole_b0 + - config-t3000 + - pipeline-perf + - ${{ inputs.extra-tag }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - name: Enable performance mode diff --git a/.github/workflows/t3000-nightly-tests-impl.yaml b/.github/workflows/t3000-nightly-tests-impl.yaml index 009e6549d19..2255a5b4f6d 100644 --- a/.github/workflows/t3000-nightly-tests-impl.yaml +++ b/.github/workflows/t3000-nightly-tests-impl.yaml @@ -2,6 +2,11 @@ name: "[internal] T3000 nightly tests impl" on: workflow_call: + inputs: + extra-tag: + required: false + type: string + default: "in-service" jobs: t3000-nightly-tests: @@ -18,7 +23,11 @@ jobs: ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-functional"] + runs-on: + - arch-wormhole_b0 + - config-t3000 + - pipeline-functional + - ${{ inputs.extra-tag }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - name: Set up dynamic env vars for build diff --git a/.github/workflows/t3000-profiler-tests-impl.yaml b/.github/workflows/t3000-profiler-tests-impl.yaml index ebd4940bb68..a703ed83b1d 100644 --- a/.github/workflows/t3000-profiler-tests-impl.yaml +++ b/.github/workflows/t3000-profiler-tests-impl.yaml @@ -2,6 +2,11 @@ name: "[internal] T3000 profiler tests impl" on: workflow_call: + inputs: + extra-tag: + required: false + type: string + default: "in-service" jobs: t3000-profiler-tests: @@ -12,7 +17,7 @@ jobs: { name: "T3000 profiler tests", arch: wormhole_b0, - runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-perf"], + runs-on: ["arch-wormhole_b0", "config-t3000", "pipeline-perf", "${{ inputs.extra-tag }}"], cmd: './tests/scripts/run_profiler_regressions.sh' }, ] diff --git a/.github/workflows/t3000-unit-tests-impl.yaml b/.github/workflows/t3000-unit-tests-impl.yaml index 0fce1a2b5e5..66e32904a0c 100644 --- a/.github/workflows/t3000-unit-tests-impl.yaml +++ b/.github/workflows/t3000-unit-tests-impl.yaml @@ -2,6 +2,11 @@ name: "[internal] T3000 unit tests impl" on: workflow_call: + inputs: + extra-tag: + required: false + type: string + default: "in-service" jobs: t3000-unit-tests: @@ -28,7 +33,11 @@ jobs: ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-functional"] + runs-on: + - arch-wormhole_b0 + - config-t3000 + - pipeline-functional + - ${{ inputs.extra-tag }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - name: Set up dynamic env vars for build diff --git a/CMakeLists.txt b/CMakeLists.txt index b03b9363675..bbac64a095b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -263,7 +263,6 @@ if("$ENV{ARCH_NAME}" STREQUAL "wormhole_b0") tt_metal/hw/inc/wormhole ${PROJECT_SOURCE_DIR}/tt_metal/hw/inc/wormhole/wormhole_b0_defines 
${UMD_HOME}/device/wormhole - ${UMD_HOME}/src/firmware/riscv/wormhole ) else() target_include_directories( @@ -271,7 +270,6 @@ else() INTERFACE ${PROJECT_SOURCE_DIR}/tt_metal/hw/inc/$ENV{ARCH_NAME} ${UMD_HOME}/device/$ENV{ARCH_NAME} - ${UMD_HOME}/src/firmware/riscv/$ENV{ARCH_NAME} ) endif() diff --git a/models/demos/llama3/lt b/models/demos/llama3/lt index c69b4113a84..70beb4e6e98 100755 --- a/models/demos/llama3/lt +++ b/models/demos/llama3/lt @@ -14,6 +14,7 @@ import re import time import signal import psutil +import json def ensure_less_installed(): @@ -52,6 +53,204 @@ def ensure_ttsmi_installed(): sys.exit(1) +class OutputEntryList: + def __init__(self): + self._entries = [] + # Create logs directory + os.makedirs("logs", exist_ok=True) + # Load existing state + self._load_state() + + def _load_state(self): + try: + with open("logs/state.json", "r") as f: + state = json.load(f) + for entry_data in state: + entry = Entry( + entry_data["command_name"], + entry_data["model"], + entry_data["device"], + entry_data["command_input"], + ) + # Restore saved attributes + entry.status = ( + "Cancelled" + if entry_data["status"] + in [ + "Running", + "Resetting", + "Initializing device", + "Starting", + "Prefill", + "Decode", + "Terminating", + "Exiting", + ] + else entry_data["status"] + ) + entry.output = entry_data["output"] + entry.log_id = entry_data["log_id"] + entry.speed = entry_data["speed"] + entry.pcc = entry_data["pcc"] + self._entries.append(entry) + except (FileNotFoundError, json.JSONDecodeError): + pass + + def save_state(self): + state = [] + for entry in self._entries: + entry_data = { + "command_name": entry.command_name, + "model": entry.model, + "device": entry.device, + "command_input": entry.command_input, + "status": entry.status, + "output": entry.output, + "log_id": entry.log_id, + } + if hasattr(entry, "speed"): + entry_data["speed"] = entry.speed + if hasattr(entry, "pcc"): + entry_data["pcc"] = entry.pcc + state.append(entry_data) + + with open("logs/state.json", "w") as f: + json.dump(state, f, indent=2) + + def __len__(self): + return len(self._entries) + + def __getitem__(self, index): + return self._entries[index] + + def __iter__(self): + return iter(self._entries) + + def append(self, entry): + if entry.log_id is None: + entry.log_id = self.next_log_id() + + # Remove any existing logs with same ID + for existing_file in os.listdir("logs"): + if existing_file.startswith(entry.log_prefix): + os.remove(os.path.join("logs", existing_file)) + + # Set parent list reference before appending + entry.set_parent_list(self) + self._entries.append(entry) + self.save_state() + + def pop(self, index): + result = self._entries.pop(index) + + try: + os.remove(result.get_log_filename()) + except (OSError, FileNotFoundError): + pass + + # Mark all subsequent entries as changed + for entry in self._entries[index:]: + entry.mark_changed() + + self.save_state() + return result + + def index(self, entry): + return self._entries.index(entry) + + def get_entries(self): + return self._entries + + def next_log_id(self): + # Fix potential issue with list comprehension on dictionary access + max_id = 0 + for entry in self._entries: + if entry.log_id > max_id: + max_id = entry.log_id + return max_id + 1 + + +class Entry: + def __init__(self, command_name, model, device, command_input): + self.command_name = command_name + self.model = model + self.device = device.upper() + self.command_input = command_input + self.status = "Waiting" + self.output = "" + self.process = None + self.log_file = 
None + self.stop_event = threading.Event() + self.lock = threading.Lock() + self.log_id = None # Will be set by OutputEntryList + self.speed = None + self.pcc = None + self.thread = None + self.changed = True # Initialize as changed to ensure first draw + self._parent_list = None # Reference to parent OutputEntryList + + @property + def log_prefix(self): + """Generate the log file prefix based on the entry's log ID""" + return f"{self.log_id:04d}-" + + def mark_changed(self): + self.changed = True + + def mark_drawn(self): + self.changed = False + + def __setattr__(self, name, value): + super().__setattr__(name, value) + # Mark as changed whenever any attribute is modified + # (except for 'changed' itself to avoid recursion) + if name != "changed" and hasattr(self, "changed"): + self.changed = True + # Save state if we have a parent list and this isn't an unpersisted attribute + if ( + hasattr(self, "_parent_list") + and self._parent_list + and name not in ["process", "log_file", "stop_event", "lock", "thread"] + ): + self._parent_list.save_state() + + def __getitem__(self, key): + # Support dictionary-style access for backward compatibility + return getattr(self, key) + + def __setitem__(self, key, value): + # Support dictionary-style assignment for backward compatibility + setattr(self, key, value) + + def get(self, key, default=None): + # Support dictionary-style get() for backward compatibility + return getattr(self, key, default) + + def get_log_filename(self): + """Generate log filename based on entry properties""" + command_name = self._get_command_name() + filename = f"{self.log_prefix}{self.device}-{self.model}-{command_name}.log" + return os.path.join("logs", filename.replace("/", "_")) + + def _get_command_name(self): + """Extract command name from command input""" + if "pytest" in self.command_input: + match = re.search(r"pytest\s+([\S]+)", self.command_input) + if match: + test_file = match.group(1) + return os.path.basename(test_file).split(".")[0] + return "pytest" + return os.path.basename(shlex.split(self.command_input)[0]) + + def open_log_file(self): + """Open and return log file for writing""" + self.log_file = open(self.get_log_filename(), "w") + return self.log_file + + def set_parent_list(self, parent_list): + self._parent_list = parent_list + + def main(stdscr): curses.curs_set(0) # Hide cursor curses.start_color() @@ -69,9 +268,8 @@ def main(stdscr): {"label": "Device (n150, n300, t3k) [all]", "value": "", "x": 0, "y": 2}, ] - output_entries = [] + output_entries = OutputEntryList() current_line = 0 # Index of the current line (input fields + output entries) - total_lines = len(input_fields) screen_lock = threading.Lock() screen_needs_update = threading.Event() # New event to signal screen updates @@ -201,26 +399,14 @@ def main(stdscr): # Create output entries for command, model, device in combinations: command_name = get_command_name(command) - entry = { - "command_name": command_name, - "model": model, - "device": device.upper(), - "status": "Waiting", - "output": "", - "process": None, - "log_file": None, - "index": len(output_entries), - "stop_event": threading.Event(), - "lock": threading.Lock(), - "command_input": command, # Save the individual command - } + entry = Entry(command_name, model, device, command) output_entries.append(entry) - # Update total_lines - total_lines = len(input_fields) + len(output_entries) + current_line = 0 screen_needs_update.set() else: # Otherwise if not the last field, move to next field + total_lines = len(input_fields) + 
len(output_entries) current_line = (current_line + 1) % total_lines screen_needs_update.set() else: @@ -228,13 +414,13 @@ def main(stdscr): entry_index = current_line - len(input_fields) if entry_index < len(output_entries): entry = output_entries[entry_index] - if entry["log_file"]: + if os.path.exists(entry.get_log_filename()): # Save current terminal state curses.def_prog_mode() # Exit curses temporarily curses.endwin() # Run less command - os.system(f"less -R {entry['log_file'].name}") + os.system(f"less -R {entry.get_log_filename()}") # Resume curses curses.reset_prog_mode() stdscr.refresh() @@ -253,20 +439,26 @@ def main(stdscr): entry_index = current_line - len(input_fields) if entry_index < len(output_entries): entry = output_entries[entry_index] - with entry["lock"]: - if entry["process"] and entry["process"].poll() is None: - # Cancel the running process - entry["stop_event"].set() - terminate_process_tree(entry["process"].pid) - entry["status"] = "Terminating" - elif entry["status"] != "Resetting": - # Remove the entry if it's already cancelled - output_entries.pop(entry_index) - total_lines -= 1 - if current_line >= total_lines: - current_line = total_lines - 1 + if cancel_entry(entry): + output_entries.pop(entry_index) + total_lines = len(input_fields) + len(output_entries) + if current_line >= total_lines: + current_line = total_lines - 1 screen_needs_update.set() + elif c == ord("X") and current_line >= len(input_fields): # Shift-X to clear all entries + to_remove = [] + for entry in output_entries: + if cancel_entry(entry): + to_remove.append(entry) + for entry in to_remove: + entry_index = output_entries.index(entry) + output_entries.pop(entry_index) + screen_needs_update.set() + total_lines = len(input_fields) + len(output_entries) + if current_line >= total_lines: + current_line = total_lines - 1 elif c == 9: # Tab key + total_lines = len(input_fields) + len(output_entries) current_line = (current_line + 1) % total_lines screen_needs_update.set() elif c == ord("r") and current_line >= len(input_fields): @@ -278,10 +470,8 @@ def main(stdscr): # Reset the entry to "Waiting" status entry["status"] = "Waiting" entry["output"] = "" - if "speed" in entry: - del entry["speed"] - if "pcc" in entry: - del entry["pcc"] + entry["speed"] = None + entry["pcc"] = None entry["process"] = None entry["log_file"] = None entry["stop_event"].clear() @@ -385,28 +575,24 @@ def draw_changes(stdscr, input_fields, output_entries, current_line, last_drawn_ output_start_y = header_y + 1 for idx, entry in enumerate(output_entries): y = output_start_y + idx - if y >= max_y - 3: # Change this line from max_y - 2 to max_y - 3 + if y >= max_y - 3: break - if ( - idx >= len(last_drawn_state["output_entries"]) - or entry != last_drawn_state["output_entries"][idx] - or current_line != last_drawn_state["current_line"] - ): + + # Only draw if entry has changed or selection state changed + if entry.changed or current_line != last_drawn_state["current_line"]: draw_output_entry(stdscr, entry, y, current_line == len(input_fields) + idx, max_x) + entry.mark_drawn() # Mark as drawn after updating # Clear any extra lines if output entries were removed for y in range( output_start_y + len(output_entries), - min( - output_start_y + len(last_drawn_state["output_entries"]), max_y - 3 - ), # Change this line from max_y - 2 to max_y - 3 + min(output_start_y + len(last_drawn_state["output_entries"]), max_y - 3), ): stdscr.move(y, 0) stdscr.clrtoeol() - # Update last_drawn_state - last_drawn_state["output_entries"] = 
[entry.copy() for entry in output_entries] last_drawn_state["current_line"] = current_line + last_drawn_state["output_entries"] = [{"log_id": entry.log_id} for entry in output_entries] def draw_input_field(stdscr, field, is_selected, max_x): @@ -423,13 +609,13 @@ def draw_input_field(stdscr, field, is_selected, max_x): def draw_output_entry(stdscr, entry, y, is_selected, max_x): cols = [ - entry["command_name"], - entry["model"], - entry["device"], - entry["status"], - entry.get("speed", ""), - entry.get("pcc", ""), - entry["output"], + entry.command_name, + entry.model, + entry.device, + entry.status, + entry.speed if entry.speed else "", + entry.pcc if entry.pcc else "", + entry.output, ] col_widths = [20, 10, 10, 20, 10, 10, max_x - 85] # Adjusted widths to accommodate the PCC column @@ -441,7 +627,7 @@ def draw_output_entry(stdscr, entry, y, is_selected, max_x): else: color = curses.color_pair(0) if i == 3: # Status column - status = entry["status"] + status = entry.status if status == "Waiting" or status == "Cancelled": color = COLOR_PAIR_WAITING elif status in ["Running", "Initializing device", "Prefill", "Decode", "Starting"] or status.startswith( @@ -527,10 +713,13 @@ def run_entry_command(entry, screen_lock, output_entries, screen_needs_update): env["FAKE_DEVICE"] = entry["device"] env["LLAMA_DIR"] = get_llama_dir(entry["model"]) + # Open log file + entry.open_log_file() + # Define command shortcuts command_shortcuts = { "demo": "pytest models/demos/llama3/demo/demo.py -k instruct_weights-1", - "demo_1layer": "pytest models/demos/llama3/demo/demo.py -k single_layer", + "demo-1layer": "pytest models/demos/llama3/demo/demo.py -k single_layer", "attention": "pytest models/demos/llama3/tests/test_llama_attention.py", "attention-prefill": "pytest models/demos/llama3/tests/test_llama_attention_prefill.py", "mlp": "pytest models/demos/llama3/tests/test_llama_mlp.py", @@ -575,11 +764,6 @@ def run_entry_command(entry, screen_lock, output_entries, screen_needs_update): # Prepare the command cmd_list = shlex.split(command_input) - # Open log file - log_filename = get_log_filename(entry["device"], entry["model"], command_input) - os.makedirs("logs", exist_ok=True) - entry["log_file"] = open(os.path.join("logs", log_filename), "w") - # If the command is invalid, write the output to the log file and return before trying to run the bad command if entry["status"] == "Error": entry["log_file"].write(entry["output"] + "\n") @@ -600,31 +784,28 @@ def run_entry_command(entry, screen_lock, output_entries, screen_needs_update): def process_output(entry, screen_lock, output_entries, screen_needs_update): - process = entry["process"] - log_file = entry["log_file"] + process = entry.process + log_file = entry.log_file previous_line = "" try: for line in iter(process.stdout.readline, ""): - if entry["stop_event"].is_set(): - break # Write to log file log_file.write(line) log_file.flush() # Update status and output based on output - status, output, speed, pcc = parse_output_line(line, previous_line, entry["status"]) + status, output, speed, pcc = parse_output_line(line, previous_line, entry.status) previous_line = line.strip() - with entry["lock"]: - if status != entry["status"] or output or speed is not None or pcc is not None: - entry["status"] = status + with entry.lock: + if status != entry.status or output or speed is not None or pcc is not None: + entry.status = status # This will mark entry as changed via __setattr__ if output: - entry["output"] = output + entry.output = output if speed is not None: 
- entry["speed"] = f"{speed:.1f}" + entry.speed = f"{speed:.1f}" if pcc is not None: - current_pcc = entry.get("pcc") - if current_pcc is None or float(pcc) < float(current_pcc): - entry["pcc"] = pcc + if entry.pcc is None or float(pcc) < float(entry.pcc): + entry.pcc = pcc screen_needs_update.set() with screen_lock: @@ -637,17 +818,19 @@ def process_output(entry, screen_lock, output_entries, screen_needs_update): # Wait for the process to fully terminate process.wait() - with entry["lock"]: + with entry.lock: if process.returncode != 0: - exception_name = find_exception_in_log(entry["log_file"].name) - entry["status"] = "Error" - if exception_name: - entry["output"] = exception_name + if entry.stop_event.is_set(): + entry.status = "Cancelled" + else: + exception_name = find_exception_in_log(entry.log_file.name) + entry.status = "Error" + if exception_name: + entry.output = exception_name reset_device_async(entry, screen_lock, screen_needs_update) - screen_needs_update.set() else: - entry["status"] = "Finished" - entry["process"] = None + entry.status = "Finished" + entry.process = None log_file.close() screen_needs_update.set() # Ensure screen is updated after process termination @@ -746,13 +929,6 @@ def get_command_name(command_input): return command_name -def get_log_filename(device, model, command_input): - command_name = get_command_name(command_input) - filename = f"{device}-{model}-{command_name}.log" - filename = filename.replace("/", "_") - return filename - - def find_exception_in_log(log_filename): exception_name = None with open(log_filename, "r") as f: @@ -807,12 +983,12 @@ def reset_device_async(entry, screen_lock, screen_needs_update): except subprocess.CalledProcessError as e: pass finally: - with entry["lock"]: - entry["status"] = previous_status + with entry.lock: + entry.status = previous_status screen_needs_update.set() - previous_status = entry["status"] - entry["status"] = "Resetting" + previous_status = entry.status + entry.status = "Resetting" reset_thread = threading.Thread(target=reset_thread) reset_thread.daemon = True reset_thread.start() @@ -827,13 +1003,28 @@ def draw_help_bar(stdscr, current_line, num_input_fields, num_output_entries): def get_help_text(current_line, num_input_fields, num_output_entries): if current_line == 0: - return "Shortcuts: demo, attention, mlp, decoder, decoder-prefill, model, model-prefill, model-quick | Enter: Submit | ↑↓: Navigate fields | Esc: Exit" + return "Shortcuts: demo, demo-1layer, attention, mlp, rmsnorm, decoder, model, model-quick, 'help' for full list | Enter: Submit | ↑↓: Navigate fields | Esc: Exit" elif current_line <= num_input_fields - 1: return "Enter: Next field | ↑↓: Navigate fields | Esc: Exit" else: - return ( - "Enter: View log | Backspace/x: Cancel/remove entry | r: Restart entry | ↑↓: Navigate entries | Esc: Exit" - ) + return "Enter: View log | Backspace/x: Cancel entry | X: Cancel all | r: Restart entry | ↑↓: Navigate entries | Esc: Exit" + + +def cancel_entry(entry): + """Handle removal of a single entry, returning True if entry was removed""" + with entry["lock"]: + if entry["process"] and entry["process"].poll() is None: + # Cancel the running process + entry["stop_event"].set() + terminate_process_tree(entry["process"].pid) + entry["status"] = "Terminating" + # Entry is still running, so don't remove it + return False + elif entry["status"] != "Resetting": + # Safe to remove the entry if it's already cancelled + return True + # Entry is running/resetting, so don't remove it + return False if __name__ 
== "__main__": diff --git a/models/demos/segformer/README.md b/models/demos/segformer/README.md new file mode 100644 index 00000000000..796a7fcdfd2 --- /dev/null +++ b/models/demos/segformer/README.md @@ -0,0 +1,20 @@ +# Segformer Demo + +## How to run demo + +- Use the following command to run the Segformer Encoder model (Classification): + ```python + pytest tests/ttnn/integration_tests/segformer/test_segformer_encoder.py + ``` + + +- Use the following command to run the Segformer Decoder module model: + ```python + pytest tests/ttnn/integration_tests/segformer/test_segformer_decode_head.py + ``` + + +- Use the following command to run the Segformer full model (Segmentation): + ```python + pytest tests/ttnn/integration_tests/segformer/test_segformer_for_semantic_segmentation.py + ``` diff --git a/models/experimental/functional_segformer/reference/segformer_attention.py b/models/demos/segformer/reference/segformer_attention.py similarity index 82% rename from models/experimental/functional_segformer/reference/segformer_attention.py rename to models/demos/segformer/reference/segformer_attention.py index c87af0938ab..b3ddfcdad3b 100644 --- a/models/experimental/functional_segformer/reference/segformer_attention.py +++ b/models/demos/segformer/reference/segformer_attention.py @@ -3,10 +3,10 @@ # SPDX-License-Identifier: Apache-2.0 import torch.nn as nn -from models.experimental.functional_segformer.reference.segformer_efficient_selfattention import ( +from models.demos.segformer.reference.segformer_efficient_selfattention import ( SegformerEfficientSelfAttention, ) -from models.experimental.functional_segformer.reference.segformer_selfoutput import SegformerSelfOutput +from models.demos.segformer.reference.segformer_selfoutput import SegformerSelfOutput class SegformerAttention(nn.Module): diff --git a/models/experimental/functional_segformer/reference/segformer_decode_head.py b/models/demos/segformer/reference/segformer_decode_head.py similarity index 97% rename from models/experimental/functional_segformer/reference/segformer_decode_head.py rename to models/demos/segformer/reference/segformer_decode_head.py index c0d4c15b35d..c8f97106fd1 100644 --- a/models/experimental/functional_segformer/reference/segformer_decode_head.py +++ b/models/demos/segformer/reference/segformer_decode_head.py @@ -5,7 +5,7 @@ import torch from torch import nn import math -from models.experimental.functional_segformer.reference.segformer_mlp import SegformerMLP +from models.demos.segformer.reference.segformer_mlp import SegformerMLP from transformers.models.segformer.modeling_segformer import SegformerPreTrainedModel @@ -50,6 +50,7 @@ def forward(self, encoder_hidden_states: torch.FloatTensor) -> torch.Tensor: encoder_hidden_state = mlp(encoder_hidden_state) encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1) encoder_hidden_state = encoder_hidden_state.reshape(batch_size, -1, height, width) + # upsample encoder_hidden_state = nn.functional.interpolate( encoder_hidden_state, size=encoder_hidden_states[0].size()[2:], mode="bilinear", align_corners=False diff --git a/models/experimental/functional_segformer/reference/segformer_dwconv.py b/models/demos/segformer/reference/segformer_dwconv.py similarity index 100% rename from models/experimental/functional_segformer/reference/segformer_dwconv.py rename to models/demos/segformer/reference/segformer_dwconv.py diff --git a/models/experimental/functional_segformer/reference/segformer_efficient_selfattention.py 
b/models/demos/segformer/reference/segformer_efficient_selfattention.py similarity index 100% rename from models/experimental/functional_segformer/reference/segformer_efficient_selfattention.py rename to models/demos/segformer/reference/segformer_efficient_selfattention.py diff --git a/models/experimental/functional_segformer/reference/segformer_encoder.py b/models/demos/segformer/reference/segformer_encoder.py similarity index 95% rename from models/experimental/functional_segformer/reference/segformer_encoder.py rename to models/demos/segformer/reference/segformer_encoder.py index dc07cdebf7a..659b5e14f79 100644 --- a/models/experimental/functional_segformer/reference/segformer_encoder.py +++ b/models/demos/segformer/reference/segformer_encoder.py @@ -5,10 +5,10 @@ import torch import torch.nn as nn from transformers.modeling_outputs import BaseModelOutput -from models.experimental.functional_segformer.reference.segformer_overlap_patch_embeddings import ( +from models.demos.segformer.reference.segformer_overlap_patch_embeddings import ( SegformerOverlapPatchEmbeddings, ) -from models.experimental.functional_segformer.reference.segformer_layer import SegformerLayer +from models.demos.segformer.reference.segformer_layer import SegformerLayer from typing import Optional, Tuple, Union diff --git a/models/experimental/functional_segformer/reference/segformer_for_semantic_segmentation.py b/models/demos/segformer/reference/segformer_for_semantic_segmentation.py similarity index 95% rename from models/experimental/functional_segformer/reference/segformer_for_semantic_segmentation.py rename to models/demos/segformer/reference/segformer_for_semantic_segmentation.py index 59b90ffebb4..a311c413840 100644 --- a/models/experimental/functional_segformer/reference/segformer_for_semantic_segmentation.py +++ b/models/demos/segformer/reference/segformer_for_semantic_segmentation.py @@ -8,8 +8,8 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss from transformers.modeling_outputs import SemanticSegmenterOutput from transformers.models.segformer.modeling_segformer import SegformerPreTrainedModel -from models.experimental.functional_segformer.reference.segformer_model import SegformerModelReference -from models.experimental.functional_segformer.reference.segformer_decode_head import SegformerDecodeHead +from models.demos.segformer.reference.segformer_model import SegformerModelReference +from models.demos.segformer.reference.segformer_decode_head import SegformerDecodeHead class SegformerForSemanticSegmentationReference(SegformerPreTrainedModel): diff --git a/models/experimental/functional_segformer/reference/segformer_layer.py b/models/demos/segformer/reference/segformer_layer.py similarity index 89% rename from models/experimental/functional_segformer/reference/segformer_layer.py rename to models/demos/segformer/reference/segformer_layer.py index 814815816eb..62ce7212da7 100644 --- a/models/experimental/functional_segformer/reference/segformer_layer.py +++ b/models/demos/segformer/reference/segformer_layer.py @@ -3,8 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 import torch.nn as nn -from models.experimental.functional_segformer.reference.segformer_attention import SegformerAttention -from models.experimental.functional_segformer.reference.segformer_mixffn import SegformerMixFFN +from models.demos.segformer.reference.segformer_attention import SegformerAttention +from models.demos.segformer.reference.segformer_mixffn import SegformerMixFFN class SegformerLayer(nn.Module): diff --git 
a/models/experimental/functional_segformer/reference/segformer_mixffn.py b/models/demos/segformer/reference/segformer_mixffn.py similarity index 91% rename from models/experimental/functional_segformer/reference/segformer_mixffn.py rename to models/demos/segformer/reference/segformer_mixffn.py index 21d2627b206..d8760292a80 100644 --- a/models/experimental/functional_segformer/reference/segformer_mixffn.py +++ b/models/demos/segformer/reference/segformer_mixffn.py @@ -4,7 +4,7 @@ import torch.nn as nn from transformers.activations import ACT2FN -from models.experimental.functional_segformer.reference.segformer_dwconv import SegformerDWConv +from models.demos.segformer.reference.segformer_dwconv import SegformerDWConv class SegformerMixFFN(nn.Module): diff --git a/models/experimental/functional_segformer/reference/segformer_mlp.py b/models/demos/segformer/reference/segformer_mlp.py similarity index 100% rename from models/experimental/functional_segformer/reference/segformer_mlp.py rename to models/demos/segformer/reference/segformer_mlp.py diff --git a/models/experimental/functional_segformer/reference/segformer_model.py b/models/demos/segformer/reference/segformer_model.py similarity index 95% rename from models/experimental/functional_segformer/reference/segformer_model.py rename to models/demos/segformer/reference/segformer_model.py index e24d246d457..fa7e0dc5a7c 100644 --- a/models/experimental/functional_segformer/reference/segformer_model.py +++ b/models/demos/segformer/reference/segformer_model.py @@ -7,7 +7,7 @@ from typing import Union, Tuple, Optional from transformers.modeling_outputs import BaseModelOutput from transformers.models.segformer.modeling_segformer import SegformerPreTrainedModel -from models.experimental.functional_segformer.reference.segformer_encoder import SegformerEncoder +from models.demos.segformer.reference.segformer_encoder import SegformerEncoder class SegformerModelReference(SegformerPreTrainedModel): diff --git a/models/experimental/functional_segformer/reference/segformer_overlap_patch_embeddings.py b/models/demos/segformer/reference/segformer_overlap_patch_embeddings.py similarity index 100% rename from models/experimental/functional_segformer/reference/segformer_overlap_patch_embeddings.py rename to models/demos/segformer/reference/segformer_overlap_patch_embeddings.py diff --git a/models/experimental/functional_segformer/reference/segformer_selfoutput.py b/models/demos/segformer/reference/segformer_selfoutput.py similarity index 100% rename from models/experimental/functional_segformer/reference/segformer_selfoutput.py rename to models/demos/segformer/reference/segformer_selfoutput.py diff --git a/models/experimental/functional_segformer/tt/common.py b/models/demos/segformer/tt/common.py similarity index 84% rename from models/experimental/functional_segformer/tt/common.py rename to models/demos/segformer/tt/common.py index 85de8856df6..5f52fe0e507 100644 --- a/models/experimental/functional_segformer/tt/common.py +++ b/models/demos/segformer/tt/common.py @@ -18,7 +18,7 @@ def __init__( height_sharding=True, activation="", groups=1, - dtype=ttnn.bfloat16, + dtype=ttnn.bfloat8_b, ) -> None: self.weights = parameters["weight"] self.bias = parameters["bias"] @@ -73,15 +73,5 @@ def __call__(self, device, input_tensor): conv_config=conv_config, groups=self.groups, ) - ## TODO: Op | WARNING | Tensor at index 0 is not allocated - # print("sr2a", output_tensor.shape) - - # output_tensor = ttnn.from_device(output_tensor) - # output_tensor = 
ttnn.to_layout(output_tensor, layout=ttnn.ROW_MAJOR_LAYOUT) - - # output_tensor = ttnn.reshape( - # output_tensor, (input_tensor.shape[0], _out_height, _out_width, output_tensor.shape[3]) - # ) - # del _out_height, _out_width return output_tensor, _out_height, _out_width diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_attention.py b/models/demos/segformer/tt/ttnn_segformer_attention.py similarity index 76% rename from models/experimental/functional_segformer/tt/ttnn_segformer_attention.py rename to models/demos/segformer/tt/ttnn_segformer_attention.py index e37a1a5b706..7ec2aacfc9b 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_attention.py +++ b/models/demos/segformer/tt/ttnn_segformer_attention.py @@ -2,10 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 -from models.experimental.functional_segformer.tt.ttnn_segformer_efficient_selfattention import ( +from models.demos.segformer.tt.ttnn_segformer_efficient_selfattention import ( TtSegformerEfficientSelfAttention, ) -from models.experimental.functional_segformer.tt.ttnn_segformer_selfoutput import TtSegformerSelfOutput +from models.demos.segformer.tt.ttnn_segformer_selfoutput import TtSegformerSelfOutput import ttnn @@ -22,9 +22,8 @@ def __init__(self, hidden_size, num_attention_heads, parameters, sequence_reduct def __call__(self, hidden_states: ttnn.Tensor, height: int, width: int, parameters, output_attentions=False): self_outputs = self.self(hidden_states, height, width, parameters.self, output_attentions) - attention_output = self.output(self_outputs[0], parameters.output) - outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + outputs = (attention_output,) + self_outputs[1:] ttnn.deallocate(self_outputs[0]) return outputs diff --git a/models/demos/segformer/tt/ttnn_segformer_decode_head.py b/models/demos/segformer/tt/ttnn_segformer_decode_head.py new file mode 100644 index 00000000000..6aed216c578 --- /dev/null +++ b/models/demos/segformer/tt/ttnn_segformer_decode_head.py @@ -0,0 +1,110 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +import math +import ttnn +from models.demos.segformer.tt.ttnn_segformer_mlp import TtSegformerMLP +from torch import nn +import tt_lib +from models.demos.segformer.tt.common import Conv +from tests.ttnn.ttnn_utility_fuction import get_shard_grid_from_num_cores + + +def torch_to_ttnn(input, device, layout=ttnn.TILE_LAYOUT): + input = ttnn.from_torch(input, ttnn.bfloat8_b) + input = ttnn.to_layout(input, layout) + input = ttnn.to_device(input, device) + return input + + +def ttnn_to_torch(input): + input = ttnn.to_layout(input, ttnn.ROW_MAJOR_LAYOUT) + input = ttnn.from_device(input) + input = ttnn.to_torch(input) + return input + + +class TtSegformerDecodeHead: + def __init__(self, config, parameters): + super().__init__() + # linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size + mlps = [] + for i in range(config.num_encoder_blocks): + mlp = TtSegformerMLP() + mlps.append(mlp) + self.linear_c = mlps + + self.linear_fuse = Conv([1, 1, 0, 0], parameters=parameters["linear_fuse"], activation="relu", deallocate=False) + + self.classifier = Conv( + [1, 1, 0, 0], + parameters=parameters["classifier"], + ) + + self.config = config + + def __call__(self, encoder_hidden_states: ttnn.bfloat8_b, parameters) -> ttnn.Tensor: + device = encoder_hidden_states[-1].device() + batch_size = encoder_hidden_states[-1].shape[0] + + all_hidden_states = () + concated_tensor = 0 + index = 0 + for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.linear_c): + height = width = int(math.sqrt(encoder_hidden_state.shape[-2])) + encoder_hidden_state = mlp(encoder_hidden_state, parameters=parameters["linear_c"][index]) + encoder_hidden_state = ttnn.to_layout(encoder_hidden_state, layout=ttnn.ROW_MAJOR_LAYOUT) + encoder_hidden_state = ttnn.reshape(encoder_hidden_state, (batch_size, height, width, -1)) + + if encoder_hidden_state.shape[-2] == 16: + ncores = 16 + elif encoder_hidden_state.shape[-2] == 32: + ncores = 32 + else: + ncores = 64 + + shard_grid = get_shard_grid_from_num_cores(ncores, device) + shard_orientation = ttnn.ShardOrientation.ROW_MAJOR + shard_height = math.ceil( + encoder_hidden_state.shape[0] * encoder_hidden_state.shape[1] * encoder_hidden_state.shape[2] / ncores + ) + shard_width = encoder_hidden_state.shape[3] + shard_spec = ttnn.ShardSpec(shard_grid, (shard_height, shard_width), shard_orientation, False) + input_memory_config = ttnn.MemoryConfig( + ttnn.types.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.types.BufferType.L1, shard_spec + ) + encoder_hidden_state = ttnn.to_memory_config(encoder_hidden_state, memory_config=input_memory_config) + + encoder_hidden_state = ttnn.upsample( + encoder_hidden_state, + scale_factor=(128 // encoder_hidden_state.shape[2], 128 // encoder_hidden_state.shape[2], 1), + mode="bilinear", + ) + + encoder_hidden_state_to_concat = ttnn.to_memory_config( + encoder_hidden_state, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16 + ) + + ttnn.deallocate(encoder_hidden_state) + encoder_hidden_state_to_concat = ttnn.reallocate(encoder_hidden_state_to_concat) + + all_hidden_states += (encoder_hidden_state_to_concat,) + + index += 1 + + concated_tensor = ttnn.concat(all_hidden_states[::-1], dim=3, memory_config=ttnn.L1_MEMORY_CONFIG) + ttnn.deallocate(all_hidden_states[0]) + ttnn.deallocate(all_hidden_states[1]) + ttnn.deallocate(all_hidden_states[2]) + ttnn.deallocate(all_hidden_states[3]) + concated_tensor = ttnn.reallocate(concated_tensor) + + concated_tensor_tile = 
ttnn.to_layout(concated_tensor, ttnn.TILE_LAYOUT, dtype=ttnn.bfloat8_b) + ttnn.deallocate(concated_tensor) + concated_tensor_tile = ttnn.reallocate(concated_tensor_tile) + + hidden_states, __, __ = self.linear_fuse(device, concated_tensor_tile) + logits, __, __ = self.classifier(device, hidden_states) + + return logits diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_dwconv.py b/models/demos/segformer/tt/ttnn_segformer_dwconv.py similarity index 71% rename from models/experimental/functional_segformer/tt/ttnn_segformer_dwconv.py rename to models/demos/segformer/tt/ttnn_segformer_dwconv.py index ad0e43dd42e..d63563d760a 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_dwconv.py +++ b/models/demos/segformer/tt/ttnn_segformer_dwconv.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import ttnn -from models.experimental.functional_segformer.tt.common import Conv +from models.demos.segformer.tt.common import Conv class TtSegformerDWConv: @@ -37,8 +37,6 @@ def __call__( width: int, device, ): - # print("dw0", hidden_states.shape) - if len(hidden_states.shape) == 3: batch_size, seq_len, num_channels = hidden_states.shape elif len(hidden_states.shape) == 4: @@ -47,15 +45,6 @@ def __call__( hidden_states = ttnn.to_layout(hidden_states, ttnn.ROW_MAJOR_LAYOUT) hidden_states = ttnn.reshape(hidden_states, (batch_size, height, width, num_channels)) - # print("dw1", hidden_states.shape) hidden_states = self.dwconv(device, hidden_states) - # print("dw2", hidden_states.shape) - # hidden_states = ttnn.reshape( - # hidden_states, - # (hidden_states.shape[0], hidden_states.shape[1] * hidden_states.shape[2], hidden_states.shape[3]), - # ) - # hidden_states = ttnn.to_layout(hidden_states, layout=ttnn.TILE_LAYOUT) - # hidden_states = ttnn.to_device(hidden_states, device=device) - # print("dw3", hidden_states.shape) return hidden_states diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_efficient_selfattention.py b/models/demos/segformer/tt/ttnn_segformer_efficient_selfattention.py similarity index 61% rename from models/experimental/functional_segformer/tt/ttnn_segformer_efficient_selfattention.py rename to models/demos/segformer/tt/ttnn_segformer_efficient_selfattention.py index b4dbade4a3d..b2d9e2f8fb2 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_efficient_selfattention.py +++ b/models/demos/segformer/tt/ttnn_segformer_efficient_selfattention.py @@ -4,7 +4,7 @@ import ttnn import math -from models.experimental.functional_segformer.tt.common import Conv +from models.demos.segformer.tt.common import Conv class TtSegformerEfficientSelfAttention: @@ -25,31 +25,6 @@ def __init__(self, hidden_size, num_attention_heads, parameters, sequence_reduct if sequence_reduction_ratio > 1: self.sr = Conv([sequence_reduction_ratio, sequence_reduction_ratio, 0, 0], parameters["sr"]) - def transpose_for_scores(self, hidden_states): - # new_shape = tuple(hidden_states.shape)[:-1] + (self.num_attention_heads, self.attention_head_size) - new_shape = ( - hidden_states.shape[0], - hidden_states.shape[-2], - self.num_attention_heads, - self.attention_head_size, - ) - device = hidden_states.device() - hidden_states = ttnn.from_device(hidden_states) - hidden_states = ttnn.to_layout(hidden_states, layout=ttnn.ROW_MAJOR_LAYOUT) - hidden_states = ttnn.reshape(hidden_states, new_shape) - hidden_states = ttnn.to_layout(hidden_states, layout=ttnn.TILE_LAYOUT) - hidden_states = ttnn.to_device(hidden_states, device) - - if 
len(hidden_states.shape) == 5: - output = ttnn.permute(hidden_states, (0, 1, 3, 2, 4)) - elif len(hidden_states.shape) == 4: - output = ttnn.permute(hidden_states, (0, 2, 1, 3)) - if len(hidden_states.shape) == 3: - output = ttnn.permute(hidden_states, (0, 2, 1)) - ttnn.deallocate(hidden_states) - - return output - def __call__( self, hidden_states: ttnn.Tensor, @@ -60,12 +35,17 @@ def __call__( ): device = hidden_states.device() + if len(hidden_states.shape) == 4: + batch_size, __, seq_len, hidden_size = hidden_states.shape + elif len(hidden_states.shape) == 3: + batch_size, seq_len, hidden_size = hidden_states.shape + mm_a_x_strategy = ttnn.ShardStrategy.HEIGHT mm_a_x_memory_config = ttnn.L1_HEIGHT_SHARDED_MEMORY_CONFIG mm_d_x_strategy = mm_a_x_strategy mm_d_x_memory_config = mm_a_x_memory_config mm_a_y = 8 - if (hidden_states.shape[-2] == 256) and (hidden_states.shape[-1] == 256): + if (seq_len == 256) and (hidden_size == 256): mm_a_x = 8 mm_b_x = 8 mm_d_x = 2 @@ -73,7 +53,7 @@ def __call__( mm_e_x = 8 mm_a_x_strategy = ttnn.ShardStrategy.BLOCK mm_a_x_memory_config = ttnn.L1_BLOCK_SHARDED_MEMORY_CONFIG - elif (hidden_states.shape[-2] == 1024) and (hidden_states.shape[-1] == 160): + elif (seq_len == 1024) and (hidden_size == 160): mm_a_x = 5 mm_b_x = 5 mm_d_x = 5 @@ -81,21 +61,19 @@ def __call__( mm_e_x = 8 mm_a_x_strategy = ttnn.ShardStrategy.BLOCK mm_a_x_memory_config = ttnn.L1_BLOCK_SHARDED_MEMORY_CONFIG - elif (hidden_states.shape[-2] == 4096) and (hidden_states.shape[-1] == 64): + elif (seq_len == 4096) and (hidden_size == 64): mm_a_x = 8 # 8 mm_b_x = 1 # 1 mm_d_x = 4 mm_d_y = 8 mm_e_x = 8 - elif (hidden_states.shape[-2] == 16384) and (hidden_states.shape[-1] == 32): + elif (seq_len == 16384) and (hidden_size == 32): mm_a_x = 8 # 8 mm_b_x = 1 # 1 - mm_d_x = 4 + mm_d_x = 8 mm_d_y = 8 mm_e_x = 8 - # print("mm-1--", hidden_states.shape, parameters.query.weight.shape) - hidden_states = ttnn.to_layout(hidden_states, ttnn.TILE_LAYOUT) hidden_states = ttnn.to_memory_config( hidden_states, @@ -116,33 +94,31 @@ def __call__( dtype=ttnn.bfloat8_b, ) - # print("Q1", query.shape) - hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) - query = ttnn.to_memory_config(query, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) + query = ttnn.to_memory_config(query, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) # Split Heads if self.num_attention_heads == 1: query_layer = query else: - query_layer = self.transpose_for_scores(query) - - # print("Q2", query_layer.shape) + query_layer = ttnn.experimental.nlp_create_qkv_heads_segformer(query, memory_config=ttnn.L1_MEMORY_CONFIG)[ + 0 + ] - # print("sr0", hidden_states.shape) if self.sr_ratio > 1: if len(hidden_states.shape) == 3: batch_size, seq_len, num_channels = hidden_states.shape elif len(hidden_states.shape) == 4: batch_size, __, seq_len, num_channels = hidden_states.shape + # Need for RM input to reshape, then back to TILE after that + hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) hidden_states = ttnn.to_layout(hidden_states, layout=ttnn.ROW_MAJOR_LAYOUT) hidden_states = ttnn.reshape(hidden_states, (batch_size, height, width, num_channels)) + hidden_states = ttnn.to_layout(hidden_states, ttnn.TILE_LAYOUT) + hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) - # print("sr1", hidden_states.shape) hidden_states, __, __ = self.sr(device, hidden_states) - # print("sr2", hidden_states.shape) hidden_states = 
ttnn.to_memory_config(hidden_states, memory_config=ttnn.L1_MEMORY_CONFIG) - # print("sr3", hidden_states.shape) hidden_states = ttnn.layer_norm( hidden_states, weight=parameters.layer_norm.weight, @@ -150,8 +126,6 @@ def __call__( memory_config=ttnn.L1_MEMORY_CONFIG, ) - # print("mm-2--", hidden_states.shape, parameters.key.weight.shape) - hidden_states = ttnn.to_layout(hidden_states, ttnn.TILE_LAYOUT) hidden_states = ttnn.to_memory_config( hidden_states, @@ -171,18 +145,14 @@ def __call__( core_grid=ttnn.CoreGrid(y=mm_a_y, x=mm_b_x), dtype=ttnn.bfloat8_b, ) - # hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) - # print("K1", key.shape) - key = ttnn.to_memory_config(key, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) + key = ttnn.to_memory_config(key, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) if self.num_attention_heads == 1: key_layer = key else: - key_layer = self.transpose_for_scores(key) + key_layer = ttnn.experimental.nlp_create_qkv_heads_segformer(key, memory_config=ttnn.L1_MEMORY_CONFIG)[0] key_layer = ttnn.permute(key_layer, (0, 1, 3, 2)) - # print("K2", key_layer.shape) - # print("mm-3--", hidden_states.shape, parameters.value.weight.shape) value = ttnn.linear( hidden_states, parameters.value.weight, @@ -191,17 +161,14 @@ def __call__( core_grid=ttnn.CoreGrid(y=mm_a_y, x=mm_b_x), dtype=ttnn.bfloat8_b, ) - # hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) ttnn.deallocate(hidden_states) - # print("V1", value.shape) - value = ttnn.to_memory_config(value, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) + value = ttnn.to_memory_config(value, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) if self.num_attention_heads == 1: value_layer = value else: - value_layer = self.transpose_for_scores(value) - # print("V2", value_layer.shape) - - # print("mm-4--", query_layer.shape, key_layer.shape) + value_layer = ttnn.experimental.nlp_create_qkv_heads_segformer(value, memory_config=ttnn.L1_MEMORY_CONFIG)[ + 0 + ] key_layer = ttnn.to_memory_config(key_layer, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) query_layer = ttnn.to_layout(query_layer, ttnn.TILE_LAYOUT) @@ -225,26 +192,14 @@ def __call__( ttnn.deallocate(query_layer) ttnn.deallocate(key_layer) - attention_scores = ttnn.to_memory_config(attention_scores, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) - denominator_value = ttnn.ones( - attention_scores.shape, layout=ttnn.TILE_LAYOUT, device=device, memory_config=ttnn.L1_MEMORY_CONFIG - ) - denominator_value = denominator_value * math.sqrt(self.attention_head_size) - denominator_value = ttnn.reciprocal(denominator_value) - attention_scores = attention_scores * denominator_value - - # Normalize the attention scores to probabilities. + attention_scores = ttnn.to_memory_config(attention_scores, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) + scale_value = self.attention_head_size**-0.5 + attention_scores = ttnn.multiply(attention_scores, scale_value) attention_probs = ttnn.softmax(attention_scores, dim=-1, memory_config=ttnn.L1_MEMORY_CONFIG) ttnn.deallocate(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
- # attention_probs = self.dropout(attention_probs) - - # print("mm-5--", attention_probs.shape, value_layer.shape) - attention_probs = ttnn.to_layout(attention_probs, ttnn.TILE_LAYOUT) + value_layer = ttnn.to_memory_config(value_layer, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) attention_probs = ttnn.to_memory_config( @@ -262,8 +217,6 @@ def __call__( value_layer, memory_config=mm_d_x_memory_config, dtype=ttnn.bfloat8_b, - # core_grid=ttnn.CoreGrid(y=8, x=8), - # program_config=ATTN_SCORE_MM_PROGCFG, ) ttnn.deallocate(value) ttnn.deallocate(value_layer) @@ -271,23 +224,13 @@ def __call__( if not output_attentions: ttnn.deallocate(attention_probs) else: - attention_probs = ttnn.to_memory_config(attention_probs, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) - # context_layer = ttnn.to_memory_config(context_layer, ttnn.DRAM_MEMORY_CONFIG, dtype=ttnn.bfloat16) - context_layer = ttnn.to_memory_config(context_layer, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) - # print("cxt1", context_layer.shape) + attention_probs = ttnn.to_memory_config(attention_probs, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) if self.num_attention_heads > 1: - context_layer = ttnn.permute(context_layer, (0, 2, 1, 3)) - context_layer = ttnn.to_memory_config( - context_layer, ttnn.L1_MEMORY_CONFIG - ) # This throws OOM issue while runnning whole_model so, DRAM memory config is used. - new_context_layer_shape = tuple(context_layer.shape)[:-2] + (self.all_head_size,) - context_layer = ttnn.from_device(context_layer) - context_layer = ttnn.to_layout(context_layer, layout=ttnn.ROW_MAJOR_LAYOUT) - context_layer = ttnn.reshape(context_layer, new_context_layer_shape) - context_layer = ttnn.to_device(context_layer, device) - context_layer = ttnn.to_layout(context_layer, layout=ttnn.TILE_LAYOUT) - # print("cxt2", context_layer.shape) + context_layer = ttnn.to_memory_config(context_layer, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) + context_layer = ttnn.experimental.nlp_concat_heads( + context_layer, memory_config=ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.L1) + ) outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_encoder.py b/models/demos/segformer/tt/ttnn_segformer_encoder.py similarity index 84% rename from models/experimental/functional_segformer/tt/ttnn_segformer_encoder.py rename to models/demos/segformer/tt/ttnn_segformer_encoder.py index 66cfe272727..2aecb8c263a 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_encoder.py +++ b/models/demos/segformer/tt/ttnn_segformer_encoder.py @@ -3,10 +3,10 @@ # SPDX-License-Identifier: Apache-2.0 import ttnn -from models.experimental.functional_segformer.tt.ttnn_segformer_overlap_patch_embeddings import ( +from models.demos.segformer.tt.ttnn_segformer_overlap_patch_embeddings import ( TtSegformerOverlapPatchEmbeddings, ) -from models.experimental.functional_segformer.tt.ttnn_segformer_layer import TtSegformerLayer +from models.demos.segformer.tt.ttnn_segformer_layer import TtSegformerLayer from typing import Optional, Tuple, Union from dataclasses import dataclass @@ -119,26 +119,18 @@ def __call__( math_fidelity=ttnn.MathFidelity.LoFi, ), ) - # pass the unflolded version to the Decoder if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - # print("e1", hidden_states.shape) - # fourth, optionally reshape back to (batch_size, num_channels, height, width) - # TODO: does the input to Conv need to be 4D? 
- if idx != len(self.patch_embeddings) - 1 or ( - idx == len(self.patch_embeddings) - 1 and self.config.reshape_last_stage - ): + if idx != len(self.patch_embeddings) - 1: + hidden_states = ttnn.to_layout(hidden_states, layout=ttnn.ROW_MAJOR_LAYOUT) + hidden_states = ttnn.reshape(hidden_states, (batch_size, height, width, -1)) + elif idx == len(self.patch_embeddings) - 1 and self.config.reshape_last_stage: + # last stage hidden_states = ttnn.to_layout(hidden_states, layout=ttnn.ROW_MAJOR_LAYOUT) hidden_states = ttnn.reshape(hidden_states, (batch_size, height, width, -1)) - hidden_states = ttnn.to_layout(hidden_states, layout=ttnn.TILE_LAYOUT) - hidden_states = ttnn.permute(hidden_states, (0, 3, 1, 2)) - # Original folded version is passed to the Decoder - # if output_hidden_states: - # all_hidden_states = all_hidden_states + (hidden_states,) - # print("e2", hidden_states.shape) if not return_dict: return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) return TtBaseModelOutput( diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_for_semantic_segmentation.py b/models/demos/segformer/tt/ttnn_segformer_for_semantic_segmentation.py similarity index 91% rename from models/experimental/functional_segformer/tt/ttnn_segformer_for_semantic_segmentation.py rename to models/demos/segformer/tt/ttnn_segformer_for_semantic_segmentation.py index e5b889fcab6..09225c51a1b 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_for_semantic_segmentation.py +++ b/models/demos/segformer/tt/ttnn_segformer_for_semantic_segmentation.py @@ -3,8 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 import ttnn -from models.experimental.functional_segformer.tt.ttnn_segformer_decode_head import TtSegformerDecodeHead -from models.experimental.functional_segformer.tt.ttnn_segformer_model import TtSegformerModel +from models.demos.segformer.tt.ttnn_segformer_decode_head import TtSegformerDecodeHead +from models.demos.segformer.tt.ttnn_segformer_model import TtSegformerModel from typing import Tuple, Union, Optional from dataclasses import dataclass diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_layer.py b/models/demos/segformer/tt/ttnn_segformer_layer.py similarity index 92% rename from models/experimental/functional_segformer/tt/ttnn_segformer_layer.py rename to models/demos/segformer/tt/ttnn_segformer_layer.py index 983c935011e..77234293988 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_layer.py +++ b/models/demos/segformer/tt/ttnn_segformer_layer.py @@ -3,8 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 import ttnn -from models.experimental.functional_segformer.tt.ttnn_segformer_attention import TtSegformerAttention -from models.experimental.functional_segformer.tt.ttnn_segformer_mix_ffn import TtSegformerMixFFN +from models.demos.segformer.tt.ttnn_segformer_attention import TtSegformerAttention +from models.demos.segformer.tt.ttnn_segformer_mix_ffn import TtSegformerMixFFN class TtSegformerLayer: @@ -37,7 +37,6 @@ def __call__( parameters.attention, output_attentions=output_attentions, ) - attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:] # add self attentions if we output attention weights diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_mix_ffn.py b/models/demos/segformer/tt/ttnn_segformer_mix_ffn.py similarity index 73% rename from models/experimental/functional_segformer/tt/ttnn_segformer_mix_ffn.py rename to 
models/demos/segformer/tt/ttnn_segformer_mix_ffn.py index 1a6ec3038e6..4cbc54c8cab 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_mix_ffn.py +++ b/models/demos/segformer/tt/ttnn_segformer_mix_ffn.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import ttnn -from models.experimental.functional_segformer.tt.ttnn_segformer_dwconv import TtSegformerDWConv +from models.demos.segformer.tt.ttnn_segformer_dwconv import TtSegformerDWConv class TtSegformerMixFFN: @@ -12,25 +12,28 @@ def __init__(self, parameters, hidden_features): self.dwconv = TtSegformerDWConv(parameters["dwconv"], hidden_features) def __call__(self, hidden_states: ttnn.Tensor, height: int, width: int, parameters, device): - # print("mm-7--", hidden_states.shape, parameters.dense1.weight.shape) + if len(hidden_states.shape) == 4: + batch_size, __, seq_len, hidden_size = hidden_states.shape + elif len(hidden_states.shape) == 3: + batch_size, seq_len, hidden_size = hidden_states.shape mm_f_x_strategy = ttnn.ShardStrategy.HEIGHT mm_f_x_memory_config = ttnn.L1_HEIGHT_SHARDED_MEMORY_CONFIG mm_f_y = 8 - if (hidden_states.shape[-2] == 256) and (hidden_states.shape[-1] == 256): - mm_f_x = 4 + if (seq_len == 256) and (hidden_size == 256): + mm_f_x = 8 mm_f_x_strategy = ttnn.ShardStrategy.BLOCK mm_f_x_memory_config = ttnn.L1_BLOCK_SHARDED_MEMORY_CONFIG - elif (hidden_states.shape[-2] == 1024) and (hidden_states.shape[-1] == 160): + elif (seq_len == 1024) and (hidden_size == 160): mm_f_x = 5 mm_f_x_strategy = ttnn.ShardStrategy.BLOCK mm_f_x_memory_config = ttnn.L1_BLOCK_SHARDED_MEMORY_CONFIG - elif (hidden_states.shape[-2] == 4096) and (hidden_states.shape[-1] == 64): + elif (seq_len == 4096) and (hidden_size == 64): mm_f_x = 2 mm_f_x_strategy = ttnn.ShardStrategy.BLOCK mm_f_x_memory_config = ttnn.L1_BLOCK_SHARDED_MEMORY_CONFIG - elif (hidden_states.shape[-2] == 16384) and (hidden_states.shape[-1] == 32): - mm_f_x = 4 + elif (seq_len == 16384) and (hidden_size == 32): + mm_f_x = 8 hidden_states = ttnn.to_layout(hidden_states, ttnn.TILE_LAYOUT) @@ -55,12 +58,8 @@ def __call__(self, hidden_states: ttnn.Tensor, height: int, width: int, paramete hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) hidden_states, __, __ = self.dwconv(hidden_states, height, width, device) - hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) - # TODO: GeLU on sharded data - hidden_states = ttnn.gelu(hidden_states, memory_config=ttnn.L1_MEMORY_CONFIG) - - # print("mm-8--", hidden_states.shape, parameters.dense2.weight.shape) - hidden_states = ttnn.to_layout(hidden_states, ttnn.TILE_LAYOUT) + # # TODO: GeLU on sharded data + hidden_states = ttnn.gelu(hidden_states) hidden_states = ttnn.to_memory_config( hidden_states, @@ -70,7 +69,6 @@ def __call__(self, hidden_states: ttnn.Tensor, height: int, width: int, paramete strategy=mm_f_x_strategy, orientation=ttnn.ShardOrientation.ROW_MAJOR, ), - dtype=ttnn.bfloat8_b, ) hidden_states = ttnn.linear( hidden_states, @@ -81,5 +79,5 @@ def __call__(self, hidden_states: ttnn.Tensor, height: int, width: int, paramete dtype=ttnn.bfloat8_b, ) - hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) + hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) return hidden_states diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_mlp.py b/models/demos/segformer/tt/ttnn_segformer_mlp.py similarity index 79% 
rename from models/experimental/functional_segformer/tt/ttnn_segformer_mlp.py rename to models/demos/segformer/tt/ttnn_segformer_mlp.py index da1efbde00e..34a8f7efeac 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_mlp.py +++ b/models/demos/segformer/tt/ttnn_segformer_mlp.py @@ -57,23 +57,6 @@ def __init__(self): def __call__(self, hidden_states: ttnn.Tensor, parameters): device = hidden_states.device() - if 0: - # print("mlp0", hidden_states.shape) - hidden_states = ttnn.from_device(hidden_states) - hidden_states = ttnn.to_layout(hidden_states, ttnn.ROW_MAJOR_LAYOUT) - hidden_states = ttnn.reshape( - hidden_states, - (hidden_states.shape[0], hidden_states.shape[1], hidden_states.shape[2] * hidden_states.shape[3]), - ) - hidden_states = ttnn.to_device(hidden_states, device=device) - hidden_states = ttnn.to_layout(hidden_states, ttnn.TILE_LAYOUT) - # print("mlp1", hidden_states.shape) - hidden_states = ttnn.permute(hidden_states, (0, 2, 1)) - # print("mlp2", hidden_states.shape) - if len(hidden_states.shape) == 2: # This is due to while permuting 1,x,y we are getting 2D as output - hidden_states = ttnn.reshape(hidden_states, (1, hidden_states.shape[0], hidden_states.shape[1])) - # print("mlp3", hidden_states.shape) - mm_f_x_strategy = ttnn.ShardStrategy.HEIGHT mm_f_x_memory_config = ttnn.L1_HEIGHT_SHARDED_MEMORY_CONFIG mm_f_y = 8 @@ -95,7 +78,6 @@ def __call__(self, hidden_states: ttnn.Tensor, parameters): mm_prog_config = program_configs["linear_config_16384"] hidden_states = ttnn.to_layout(hidden_states, ttnn.TILE_LAYOUT) - # print("---mlp MM----", hidden_states.shape, parameters.proj.weight.shape) if (hidden_states.shape[-2] == 1024) and (hidden_states.shape[-1] == 160): # TODO: convert it to sharding diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_model.py b/models/demos/segformer/tt/ttnn_segformer_model.py similarity index 95% rename from models/experimental/functional_segformer/tt/ttnn_segformer_model.py rename to models/demos/segformer/tt/ttnn_segformer_model.py index ff0c711d325..d325895ce8f 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_model.py +++ b/models/demos/segformer/tt/ttnn_segformer_model.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import ttnn -from models.experimental.functional_segformer.tt.ttnn_segformer_encoder import TtSegformerEncoder +from models.demos.segformer.tt.ttnn_segformer_encoder import TtSegformerEncoder from typing import Optional, Tuple, Union from dataclasses import dataclass diff --git a/models/demos/segformer/tt/ttnn_segformer_overlap_patch_embeddings.py b/models/demos/segformer/tt/ttnn_segformer_overlap_patch_embeddings.py new file mode 100644 index 00000000000..c81bfad5569 --- /dev/null +++ b/models/demos/segformer/tt/ttnn_segformer_overlap_patch_embeddings.py @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +import ttnn +from models.demos.segformer.tt.common import Conv + + +class TtSegformerOverlapPatchEmbeddings: + """Construct the overlapping patch embeddings.""" + + def __init__(self, parameters, stride, patch_size): + super().__init__() + self.proj = Conv([stride, stride, patch_size // 2, patch_size // 2], parameters=parameters["proj"]) + + def __call__( + self, + pixel_values: ttnn.Tensor, + parameters, + ): + device = pixel_values.device() + + if pixel_values.shape[-1] == 3: + pixel_values_rm = ttnn.from_device(pixel_values) + pixel_values_rm = ttnn.to_layout(pixel_values_rm, layout=ttnn.ROW_MAJOR_LAYOUT) + else: + pixel_values_rm = pixel_values + + embeddings, input_height, input_width = self.proj(device, pixel_values_rm) + embeddings = ttnn.to_memory_config(embeddings, memory_config=ttnn.L1_MEMORY_CONFIG) + ttnn.deallocate(pixel_values) + embeddings = ttnn.reallocate(embeddings) + + embeddings = ttnn.layer_norm( + embeddings, + weight=parameters.layer_norm.weight, + bias=parameters.layer_norm.bias, + memory_config=ttnn.L1_MEMORY_CONFIG, + compute_kernel_config=ttnn.WormholeComputeKernelConfig( + math_fidelity=ttnn.MathFidelity.LoFi, + ), + ) + + return embeddings, input_height, input_width diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_selfoutput.py b/models/demos/segformer/tt/ttnn_segformer_selfoutput.py similarity index 72% rename from models/experimental/functional_segformer/tt/ttnn_segformer_selfoutput.py rename to models/demos/segformer/tt/ttnn_segformer_selfoutput.py index 22083348634..daf1d89d525 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_selfoutput.py +++ b/models/demos/segformer/tt/ttnn_segformer_selfoutput.py @@ -10,23 +10,26 @@ def __init__(self): super().__init__() def __call__(self, hidden_states: ttnn.Tensor, parameters): - # print("mm-6--", hidden_states.shape) + if len(hidden_states.shape) == 4: + batch_size, __, seq_len, hidden_size = hidden_states.shape + elif len(hidden_states.shape) == 3: + batch_size, seq_len, hidden_size = hidden_states.shape mm_f_x_strategy = ttnn.ShardStrategy.HEIGHT mm_f_x_memory_config = ttnn.L1_HEIGHT_SHARDED_MEMORY_CONFIG mm_f_y = 8 - if (hidden_states.shape[-2] == 256) and (hidden_states.shape[-1] == 256): - mm_f_x = 4 + if (seq_len == 256) and (hidden_size == 256): + mm_f_x = 8 mm_f_x_strategy = ttnn.ShardStrategy.BLOCK mm_f_x_memory_config = ttnn.L1_BLOCK_SHARDED_MEMORY_CONFIG - elif (hidden_states.shape[-2] == 1024) and (hidden_states.shape[-1] == 160): + elif (seq_len == 1024) and (hidden_size == 160): mm_f_x = 5 mm_f_x_strategy = ttnn.ShardStrategy.BLOCK mm_f_x_memory_config = ttnn.L1_BLOCK_SHARDED_MEMORY_CONFIG - elif (hidden_states.shape[-2] == 4096) and (hidden_states.shape[-1] == 64): - mm_f_x = 4 - elif (hidden_states.shape[-2] == 16384) and (hidden_states.shape[-1] == 32): - mm_f_x = 4 + elif (seq_len == 4096) and (hidden_size == 64): + mm_f_x = 8 + elif (seq_len == 16384) and (hidden_size == 32): + mm_f_x = 8 hidden_states = ttnn.to_memory_config( hidden_states, @@ -47,7 +50,6 @@ def __call__(self, hidden_states: ttnn.Tensor, parameters): dtype=ttnn.bfloat8_b, ) - hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) - # print("self", hidden_states.shape) + hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) return hidden_states diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_decode_head.py 
b/models/experimental/functional_segformer/tt/ttnn_segformer_decode_head.py deleted file mode 100644 index 1a78b2acc02..00000000000 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_decode_head.py +++ /dev/null @@ -1,175 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import math -import ttnn -from models.experimental.functional_segformer.tt.ttnn_segformer_mlp import TtSegformerMLP -from torch import nn -import tt_lib -from models.experimental.functional_segformer.tt.common import Conv -from tests.ttnn.ttnn_utility_fuction import get_shard_grid_from_num_cores - - -def torch_to_ttnn(input, device, layout=ttnn.TILE_LAYOUT): - input = ttnn.from_torch(input, ttnn.bfloat16) - input = ttnn.to_layout(input, layout) - input = ttnn.to_device(input, device) - return input - - -def ttnn_to_torch(input): - input = ttnn.to_layout(input, ttnn.ROW_MAJOR_LAYOUT) - input = ttnn.from_device(input) - input = ttnn.to_torch(input) - return input - - -class TtSegformerDecodeHead: - def __init__(self, config, parameters): - super().__init__() - # linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size - mlps = [] - for i in range(config.num_encoder_blocks): - mlp = TtSegformerMLP() - mlps.append(mlp) - self.linear_c = mlps - - self.linear_fuse = Conv( - [1, 1, 0, 0], - parameters=parameters["linear_fuse"], - activation="relu", - ) - - self.classifier = Conv( - [1, 1, 0, 0], - parameters=parameters["classifier"], - ) - - self.config = config - - def __call__(self, encoder_hidden_states: ttnn.bfloat16, parameters) -> ttnn.Tensor: - device = encoder_hidden_states[-1].device() - batch_size = encoder_hidden_states[-1].shape[0] - - all_hidden_states = () - index = 0 - for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.linear_c): - # This condition was the original PyTorch graph with the redundant TMs to unfold/fold/unfold - if 0: - if self.config.reshape_last_stage is False and (encoder_hidden_state.shape) == 3: - height = width = int(math.sqrt(encoder_hidden_state.shape[-1])) - encoder_hidden_state = ttnn.reshape(encoder_hidden_state, (batch_size, height, width, -1)) - encoder_hidden_state = ttnn.permute(encoder_hidden_state, (0, 3, 1, 2)) - height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3] - else: - height = width = int(math.sqrt(encoder_hidden_state.shape[-2])) - - # print("iii", index) - # print("ee0", encoder_hidden_state.shape) - encoder_hidden_state = mlp(encoder_hidden_state, parameters=parameters["linear_c"][index]) - - # print("ee1", encoder_hidden_state.shape) - encoder_hidden_state = ttnn.to_layout(encoder_hidden_state, layout=ttnn.ROW_MAJOR_LAYOUT) - encoder_hidden_state = ttnn.reshape(encoder_hidden_state, (batch_size, height, width, -1)) - - ## Original TM but I replaced it with one reshape in the previous line. 
- ## To revisit to investigate on the low PCC - """ - if len(encoder_hidden_state.shape) == 3: - encoder_hidden_state = ttnn.permute(encoder_hidden_state, (0, 2, 1)) - else: - encoder_hidden_state = ttnn.permute(encoder_hidden_state, (0, 1, 3, 2)) - print("ee2", encoder_hidden_state.shape) - encoder_hidden_state = ttnn.from_device(encoder_hidden_state) - encoder_hidden_state = ttnn.to_layout(encoder_hidden_state, layout=ttnn.ROW_MAJOR_LAYOUT) - encoder_hidden_state = ttnn.reshape(encoder_hidden_state, (batch_size, -1, height, width)) - encoder_hidden_state = ttnn.to_device(encoder_hidden_state, device) - print("ee3", encoder_hidden_state.shape) - encoder_hidden_state = ttnn.permute(encoder_hidden_state, (0, 2, 3, 1)) - print("ee4", encoder_hidden_state.shape) - """ - - if encoder_hidden_state.shape[-2] == 16: - ncores = 16 - elif encoder_hidden_state.shape[-2] == 32: - ncores = 32 - else: - ncores = 64 - - shard_grid = get_shard_grid_from_num_cores(ncores, device) - shard_orientation = ttnn.ShardOrientation.ROW_MAJOR - - shard_height = math.ceil( - encoder_hidden_state.shape[0] * encoder_hidden_state.shape[1] * encoder_hidden_state.shape[2] / ncores - ) - shard_width = encoder_hidden_state.shape[3] - shard_spec = ttnn.ShardSpec(shard_grid, (shard_height, shard_width), shard_orientation, False) - input_memory_config = ttnn.MemoryConfig( - ttnn.types.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.types.BufferType.L1, shard_spec - ) - encoder_hidden_state = ttnn.to_memory_config(encoder_hidden_state, memory_config=input_memory_config) - - # print(shard_spec) - # print("up", encoder_hidden_state.shape) - encoder_hidden_state = ttnn.upsample( - encoder_hidden_state, - scale_factor=(128 // encoder_hidden_state.shape[2], 128 // encoder_hidden_state.shape[2], 1), - mode="bilinear", - ) - - encoder_hidden_state = ttnn.to_memory_config( - encoder_hidden_state, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16 - ) - # print("up2", encoder_hidden_state.shape) - # encoder_hidden_state = ttnn.permute(encoder_hidden_state, (0, 3, 1, 2)) - # print("up3", encoder_hidden_state.shape) - - # all_hidden_states += (encoder_hidden_state,) - # encoder_hidden_state = ttnn.to_layout(encoder_hidden_state, ttnn.TILE_LAYOUT) - if index == 0: - concated_tensor = encoder_hidden_state - else: - concated_tensor = ttnn.concat( - [concated_tensor, encoder_hidden_state], dim=3, memory_config=ttnn.L1_MEMORY_CONFIG - ) - ttnn.deallocate(encoder_hidden_state) - ttnn.reallocate(concated_tensor) - # print("conc", index, concated_tensor.shape) - index += 1 - - # Replaced with the phased concat of each 2 inputs, to handle the L1 OOM - ###concated_tensor = ttnn.concat(all_hidden_states[::-1], dim=1, memory_config=ttnn.L1_MEMORY_CONFIG) - - # ttnn.deallocate(encoder_hidden_state) - # concated_tensor = ttnn.to_memory_config(concated_tensor, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) - # ttnn.reallocate(concated_tensor) - - # print("conc", concated_tensor.shape) - - # concated_tensor = ttnn.to_layout(concated_tensor, layout=ttnn.ROW_MAJOR_LAYOUT) - # concated_tensor = ttnn.permute(concated_tensor, (0, 3, 1, 2)) - # concated_tensor = ttnn.reshape(concated_tensor, (concated_tensor[0], concated_tensor[-1], concated_tensor[1], concated_tensor[2]) ) - - ttnn.reallocate(concated_tensor) - # print("conc_p", concated_tensor.shape) - - hidden_states, __, __ = self.linear_fuse(device, concated_tensor) - # print("c1", hidden_states.shape) - - logits, __, __ = self.classifier(device, hidden_states) - logits_shape = logits.shape - logits = 
ttnn.to_device(logits, device=device) - logits = ttnn.to_memory_config(logits, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) - logits = ttnn.to_layout(logits, layout=ttnn.ROW_MAJOR_LAYOUT) - # print("logits", logits.shape) - h = w = int(math.sqrt(logits.shape[-2])) - logits = ttnn.reshape(logits, (logits_shape[0], h, w, logits_shape[-1])) - # print(logits.shape) - logits = ttnn.permute(logits, (0, 3, 1, 2)) - logits = ttnn.to_layout(logits, layout=ttnn.ROW_MAJOR_LAYOUT) - logits = logits[:, :150, :, :] # returns out_channel 160 instead of 150 - logits = ttnn.to_layout(logits, layout=ttnn.TILE_LAYOUT) - # logits are of shape (batch_size, num_labels, height/4, width/4) - - return logits diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_overlap_patch_embeddings.py b/models/experimental/functional_segformer/tt/ttnn_segformer_overlap_patch_embeddings.py deleted file mode 100644 index d20a3c1e11c..00000000000 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_overlap_patch_embeddings.py +++ /dev/null @@ -1,73 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import ttnn -from models.experimental.functional_segformer.tt.common import Conv - - -class TtSegformerOverlapPatchEmbeddings: - """Construct the overlapping patch embeddings.""" - - def __init__(self, parameters, stride, patch_size): - super().__init__() - self.proj = Conv([stride, stride, patch_size // 2, patch_size // 2], parameters=parameters["proj"]) - - def __call__( - self, - pixel_values: ttnn.Tensor, - parameters, - ): - # print("ov0", pixel_values.shape) - device = pixel_values.device() - - pixel_values = ttnn.permute(pixel_values, (0, 2, 3, 1)) - - if pixel_values.shape[3] == 3: - pixel_values = ttnn.from_device(pixel_values) - pixel_values = ttnn.to_layout(pixel_values, layout=ttnn.ROW_MAJOR_LAYOUT) - embeddings, input_height, input_width = self.proj(device, pixel_values) - embeddings = ttnn.to_memory_config(embeddings, memory_config=ttnn.L1_MEMORY_CONFIG) - ttnn.deallocate(pixel_values) - # print("ov1", embeddings.shape) - """ - #embeddings = ttnn.to_layout(embeddings, layout=ttnn.TILE_LAYOUT) - embeddings = ttnn.to_layout(embeddings, layout=ttnn.ROW_MAJOR_LAYOUT) - embeddings = ttnn.to_device(embeddings, device=device) - - embeddings = ttnn.permute(embeddings, (0, 3, 1, 2)) - batch_size, _, input_height, input_width = embeddings.shape - - ttnn.deallocate(pixel_values) - embeddings = ttnn.from_device(embeddings) - embeddings = ttnn.to_layout(embeddings, layout=ttnn.ROW_MAJOR_LAYOUT) - - print("ov2", embeddings.shape) - embeddings = ttnn.reshape( - embeddings, (embeddings.shape[0], embeddings.shape[1], embeddings.shape[2] * embeddings.shape[3]) - ) - embeddings = ttnn.to_layout(embeddings, layout=ttnn.TILE_LAYOUT) - embeddings = ttnn.to_device(embeddings, device) - - print("ov3", embeddings.shape) - - embeddings = ttnn.permute(embeddings, (0, 2, 1)) - if len(embeddings.shape) == 2: - embeddings = ttnn.reshape(embeddings, (1, embeddings.shape[0], embeddings.shape[1])) - - print("ov4", embeddings.shape) - """ - - embeddings = ttnn.layer_norm( - embeddings, - weight=parameters.layer_norm.weight, - bias=parameters.layer_norm.bias, - memory_config=ttnn.L1_MEMORY_CONFIG, - compute_kernel_config=ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - ), - ) - - # print("ov5", embeddings.shape) - - return embeddings, input_height, input_width diff --git 
a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_segformer.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_segformer.py new file mode 100644 index 00000000000..0b5dd234fec --- /dev/null +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_segformer.py @@ -0,0 +1,110 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from loguru import logger + +from models.utility_functions import tt2torch_tensor, comp_pcc +from models.utility_functions import is_grayskull +import torch +import ttnn + +""" +Segformer shapes + functionality +""" + + +def run_nlp_create_qkv_heads_segformer_test(batch, seq_len, hidden_dim, dtype, in0_mem_config, out_mem_config, device): + torch.manual_seed(1234) + + in0_shape = [batch, 1, seq_len, hidden_dim] + + A = torch.randn(in0_shape) + + in0_t = ttnn.Tensor(A, dtype).to(ttnn.TILE_LAYOUT).to(device, in0_mem_config) + + q = ttnn.experimental.nlp_create_qkv_heads_segformer(in0_t, memory_config=out_mem_config)[0] + + # Check memory of inputs and outputs + assert in0_t.memory_config().buffer_type == in0_mem_config.buffer_type + assert q.memory_config().buffer_type == out_mem_config.buffer_type + logger.debug(f"in0: {in0_t.memory_config().buffer_type} and {in0_t.get_dtype()}") + logger.debug(f"q: {q.memory_config().buffer_type} and {q.get_dtype()}") + + head_dim = 32 + heads_num = hidden_dim // head_dim + assert list(q.shape.with_tile_padding()) == [batch, heads_num, seq_len, head_dim] + + pyt_got_back_rm_q = tt2torch_tensor(q) + + ref_q = A + # Additional shuffling for Q,K,V heads + ref_q = torch.reshape(ref_q, [batch, seq_len, heads_num, head_dim]).transpose(-3, -2) + + if dtype == ttnn.bfloat8_b: + pcc = 0.99 + else: + pcc = 1.0 + + passing_pcc_q, output_pcc_q = comp_pcc(pyt_got_back_rm_q, ref_q, pcc) + logger.debug(f"Q passing={passing_pcc_q}") + logger.debug(f"Q output pcc={output_pcc_q}") + assert passing_pcc_q + + +@pytest.mark.parametrize( + "out_mem_config", + ( + ttnn.DRAM_MEMORY_CONFIG, + ttnn.L1_MEMORY_CONFIG, + ), + ids=["out_DRAM", "out_L1"], +) +@pytest.mark.parametrize( + "in0_mem_config", + ( + ttnn.DRAM_MEMORY_CONFIG, + ttnn.L1_MEMORY_CONFIG, + ), + ids=["in0_DRAM", "in0_L1"], +) +@pytest.mark.parametrize( + "dtype", + (ttnn.bfloat8_b, ttnn.bfloat16), + ids=["BFLOAT8_B", "BFLOAT16"], +) +@pytest.mark.parametrize( + "batch, seq_len, hidden_dim", + ((1, 4096, 64), (1, 1024, 160), (1, 256, 256)), + ids=[ + "batch1_seq4k", + "batch1_seq1k", + "batch1_seq256", + ], +) +def test_nlp_create_qkv_heads_segformer_test( + batch, seq_len, hidden_dim, dtype, in0_mem_config, out_mem_config, request, device +): + if is_grayskull() and dtype == ttnn.float32: + pytest.skip("Skipping float32 tests on Grayskull") + run_nlp_create_qkv_heads_segformer_test(batch, seq_len, hidden_dim, dtype, in0_mem_config, out_mem_config, device) + + +def test_nlp_create_qkv_heads_segformer_with_program_cache(device, use_program_cache): + dtype = ttnn.bfloat8_b + mem_config = ttnn.DRAM_MEMORY_CONFIG + for _ in range(2): + run_nlp_create_qkv_heads_segformer_test(1, 32, 32, dtype, mem_config, mem_config, device) + dummy_shape = [1, 1, 32, 32] + py_dummy_tensor = torch.randn(dummy_shape) + tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config) + + mem_config = ttnn.L1_MEMORY_CONFIG + for _ in range(2): + run_nlp_create_qkv_heads_segformer_test(1, 32, 32, dtype, mem_config, mem_config, device) + 
dummy_shape = [1, 1, 32, 32] + py_dummy_tensor = torch.randn(dummy_shape) + tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config) + + assert device.num_program_cache_entries() == 2 diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_vit.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_vit.py new file mode 100644 index 00000000000..28d369a3ef9 --- /dev/null +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_vit.py @@ -0,0 +1,123 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from loguru import logger + +from models.utility_functions import tt2torch_tensor, comp_pcc +from models.utility_functions import is_grayskull +import torch +import ttnn + +""" +ViT shapes + functionality +""" + + +def run_nlp_create_qkv_heads_vit_test(batch, seq_len, dtype, in0_mem_config, out_mem_config, device): + torch.manual_seed(1234) + + in0_shape = [batch, 1, seq_len, 2304] + + A = torch.randn(in0_shape) + + in0_t = ttnn.Tensor(A, dtype).to(ttnn.TILE_LAYOUT).to(device, in0_mem_config) + + q, k, v = ttnn.experimental.nlp_create_qkv_heads_vit(in0_t, memory_config=out_mem_config) + + # Check memory of inputs and outputs + assert in0_t.memory_config().buffer_type == in0_mem_config.buffer_type + assert q.memory_config().buffer_type == out_mem_config.buffer_type + assert k.memory_config().buffer_type == out_mem_config.buffer_type + assert v.memory_config().buffer_type == out_mem_config.buffer_type + logger.debug(f"in0: {in0_t.memory_config().buffer_type} and {in0_t.get_dtype()}") + logger.debug(f"q: {q.memory_config().buffer_type} and {q.get_dtype()}") + logger.debug(f"k: {k.memory_config().buffer_type} and {k.get_dtype()}") + logger.debug(f"v: {v.memory_config().buffer_type} and {v.get_dtype()}") + + assert list(q.shape.with_tile_padding()) == [batch, 12, seq_len, 64] + assert list(k.shape.with_tile_padding()) == [batch, 12, seq_len, 64] + assert list(v.shape.with_tile_padding()) == [batch, 12, seq_len, 64] + + pyt_got_back_rm_q = tt2torch_tensor(q) + pyt_got_back_rm_k = tt2torch_tensor(k) + pyt_got_back_rm_v = tt2torch_tensor(v) + + (ref_q, ref_k, ref_v) = torch.split(A, [768, 768, 768], dim=-1) + # Additional shuffling for Q,K,V heads + ref_q = torch.reshape(ref_q, [batch, seq_len, 12, 64]).transpose(-3, -2) + ref_k = torch.reshape(ref_k, [batch, seq_len, 12, 64]).transpose(-3, -2) + ref_v = torch.reshape(ref_v, [batch, seq_len, 12, 64]).transpose(-3, -2) + + if dtype == ttnn.bfloat8_b: + pcc = 0.99 + else: + pcc = 1.0 + + passing_pcc_q, output_pcc_q = comp_pcc(pyt_got_back_rm_q, ref_q, pcc) + logger.debug(f"Q passing={passing_pcc_q}") + logger.debug(f"Q output pcc={output_pcc_q}") + assert passing_pcc_q + passing_pcc_k, output_pcc_k = comp_pcc(pyt_got_back_rm_k, ref_k, pcc) + logger.debug(f"K passing={passing_pcc_k}") + logger.debug(f"K output pcc={output_pcc_k}") + assert passing_pcc_k + passing_pcc_v, output_pcc_v = comp_pcc(pyt_got_back_rm_v, ref_v, pcc) + logger.debug(f"V passing={passing_pcc_v}") + logger.debug(f"V output pcc={output_pcc_v}") + assert passing_pcc_v + + +@pytest.mark.parametrize( + "out_mem_config", + ( + ttnn.DRAM_MEMORY_CONFIG, + ttnn.L1_MEMORY_CONFIG, + ), + ids=["out_DRAM", "out_L1"], +) +@pytest.mark.parametrize( + "in0_mem_config", + ( + ttnn.DRAM_MEMORY_CONFIG, + ttnn.L1_MEMORY_CONFIG, + ), + ids=["in0_DRAM", "in0_L1"], +) +@pytest.mark.parametrize( + "dtype", + 
(ttnn.bfloat8_b, ttnn.bfloat16), + ids=["BFLOAT8_B", "BFLOAT16"], +) +@pytest.mark.parametrize( + "batch, seq_len", + ((1, 224), (1, 4096)), + ids=[ + "batch1_seq224", + "batch1_seq4k", + ], +) +def test_nlp_create_qkv_heads_vit_test(batch, seq_len, dtype, in0_mem_config, out_mem_config, request, device): + if is_grayskull() and dtype == ttnn.float32: + pytest.skip("Skipping float32 tests on Grayskull") + run_nlp_create_qkv_heads_vit_test(batch, seq_len, dtype, in0_mem_config, out_mem_config, device) + + +def test_nlp_create_qkv_heads_vit_with_program_cache(device, use_program_cache): + dtype = ttnn.bfloat8_b + mem_config = ttnn.DRAM_MEMORY_CONFIG + for _ in range(2): + run_nlp_create_qkv_heads_vit_test(1, 32, dtype, mem_config, mem_config, device) + dummy_shape = [1, 1, 32, 32] + py_dummy_tensor = torch.randn(dummy_shape) + tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config) + + mem_config = ttnn.L1_MEMORY_CONFIG + for _ in range(2): + run_nlp_create_qkv_heads_vit_test(1, 32, dtype, mem_config, mem_config, device) + dummy_shape = [1, 1, 32, 32] + py_dummy_tensor = torch.randn(dummy_shape) + tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config) + + assert device.num_program_cache_entries() == 2 diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_attention.py b/tests/ttnn/integration_tests/segformer/test_segformer_attention.py index a2e35af0a46..4cc714506cb 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_attention.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_attention.py @@ -11,10 +11,10 @@ import pytest from tests.ttnn.utils_for_testing import assert_with_pcc from models.utility_functions import skip_for_grayskull -from models.experimental.functional_segformer.tt.ttnn_segformer_attention import ( +from models.demos.segformer.tt.ttnn_segformer_attention import ( TtSegformerAttention, ) -from models.experimental.functional_segformer.reference.segformer_attention import SegformerAttention +from models.demos.segformer.reference.segformer_attention import SegformerAttention from tests.ttnn.integration_tests.segformer.test_segformer_efficient_selfattention import ( create_custom_preprocessor as create_customer_preprocessor_selfattention, ) @@ -72,7 +72,7 @@ def test_segformer_attention( if is_ci_env: pytest.skip("Skip in CI, model is WIP, issue# 13357") - torch_input_tensor = torch.randn(batch_size, seq_len, hidden_size) + torch_input_tensor = torch.randn(batch_size, 1, seq_len, hidden_size) ttnn_input_tensor = ttnn.from_torch( torch_input_tensor, dtype=ttnn.bfloat16, @@ -95,6 +95,7 @@ def test_segformer_attention( reference_model.load_state_dict(sd) reference_model.eval() + torch_input_tensor = torch.reshape(torch_input_tensor, (batch_size, seq_len, hidden_size)) output = reference_model(torch_input_tensor, height, width) parameters = preprocess_model_parameters( @@ -116,6 +117,4 @@ def test_segformer_attention( if len(ttnn_final_output.shape) == 4: ttnn_final_output = ttnn_final_output[0] - assert_with_pcc( - output[0], ttnn_final_output, pcc=0.85 - ) # 0.97 to 0.85 due to adding parameters to linear and layernorm + assert_with_pcc(output[0], ttnn_final_output, pcc=0.99) diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_decode_head.py b/tests/ttnn/integration_tests/segformer/test_segformer_decode_head.py index 6e898cdede7..61b0ee600f5 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_decode_head.py +++ 
b/tests/ttnn/integration_tests/segformer/test_segformer_decode_head.py @@ -3,13 +3,14 @@ # SPDX-License-Identifier: Apache-2.0 import torch +import math import pytest import ttnn from tests.ttnn.utils_for_testing import assert_with_pcc from transformers import SegformerForSemanticSegmentation from ttnn.model_preprocessing import fold_batch_norm2d_into_conv2d, preprocess_model_parameters -from models.experimental.functional_segformer.reference.segformer_decode_head import SegformerDecodeHead -from models.experimental.functional_segformer.tt.ttnn_segformer_decode_head import TtSegformerDecodeHead +from models.demos.segformer.reference.segformer_decode_head import SegformerDecodeHead +from models.demos.segformer.tt.ttnn_segformer_decode_head import TtSegformerDecodeHead from tests.ttnn.integration_tests.segformer.test_segformer_mlp import ( create_custom_preprocessor as create_custom_preprocessor_mlp, ) @@ -58,45 +59,39 @@ def test_segformer_decode_head(device, is_ci_env): torch_input_tensor_2 = torch.randn(1, 160, 32, 32) torch_input_tensor_3 = torch.randn(1, 256, 16, 16) - if 0: - torch_input_tensor_0_folded = torch_input_tensor_0 - torch_input_tensor_1_folded = torch_input_tensor_1 - torch_input_tensor_2_folded = torch_input_tensor_2 - torch_input_tensor_3_folded = torch_input_tensor_3 - else: - torch_input_tensor_0_folded = torch.reshape(torch_input_tensor_0, (batch_size, 32, 128 * 128)) - torch_input_tensor_0_folded = torch.permute(torch_input_tensor_0_folded, (0, 2, 1)) - torch_input_tensor_1_folded = torch.reshape(torch_input_tensor_1, (batch_size, 64, 64 * 64)) - torch_input_tensor_1_folded = torch.permute(torch_input_tensor_1_folded, (0, 2, 1)) - torch_input_tensor_2_folded = torch.reshape(torch_input_tensor_2, (batch_size, 160, 32 * 32)) - torch_input_tensor_2_folded = torch.permute(torch_input_tensor_2_folded, (0, 2, 1)) - torch_input_tensor_3_folded = torch.reshape(torch_input_tensor_3, (batch_size, 256, 16 * 16)) - torch_input_tensor_3_folded = torch.permute(torch_input_tensor_3_folded, (0, 2, 1)) + torch_input_tensor_0_folded = torch.reshape(torch_input_tensor_0, (batch_size, 32, 128 * 128)) + torch_input_tensor_0_folded = torch.permute(torch_input_tensor_0_folded, (0, 2, 1)) + torch_input_tensor_1_folded = torch.reshape(torch_input_tensor_1, (batch_size, 64, 64 * 64)) + torch_input_tensor_1_folded = torch.permute(torch_input_tensor_1_folded, (0, 2, 1)) + torch_input_tensor_2_folded = torch.reshape(torch_input_tensor_2, (batch_size, 160, 32 * 32)) + torch_input_tensor_2_folded = torch.permute(torch_input_tensor_2_folded, (0, 2, 1)) + torch_input_tensor_3_folded = torch.reshape(torch_input_tensor_3, (batch_size, 256, 16 * 16)) + torch_input_tensor_3_folded = torch.permute(torch_input_tensor_3_folded, (0, 2, 1)) ttnn_input_tensor_0 = ttnn.from_torch( torch_input_tensor_0_folded, - dtype=ttnn.bfloat16, + dtype=ttnn.bfloat8_b, memory_config=ttnn.L1_MEMORY_CONFIG, device=device, layout=ttnn.TILE_LAYOUT, ) ttnn_input_tensor_1 = ttnn.from_torch( torch_input_tensor_1_folded, - dtype=ttnn.bfloat16, + dtype=ttnn.bfloat8_b, memory_config=ttnn.L1_MEMORY_CONFIG, device=device, layout=ttnn.TILE_LAYOUT, ) ttnn_input_tensor_2 = ttnn.from_torch( torch_input_tensor_2_folded, - dtype=ttnn.bfloat16, + dtype=ttnn.bfloat8_b, memory_config=ttnn.L1_MEMORY_CONFIG, device=device, layout=ttnn.TILE_LAYOUT, ) ttnn_input_tensor_3 = ttnn.from_torch( torch_input_tensor_3_folded, - dtype=ttnn.bfloat16, + dtype=ttnn.bfloat8_b, memory_config=ttnn.L1_MEMORY_CONFIG, device=device, layout=ttnn.TILE_LAYOUT, @@ 
-138,13 +133,12 @@ def test_segformer_decode_head(device, is_ci_env): ttnn_model = TtSegformerDecodeHead(config, parameters) ttnn_output = ttnn_model(ttnn_input_tensor, parameters) + ttnn.deallocate(ttnn_input_tensor_0) - ttnn_output = ttnn.from_device(ttnn_output) ttnn_output = ttnn.to_torch(ttnn_output) + ttnn_output = torch.permute(ttnn_output, (0, 3, 1, 2)) + h = w = int(math.sqrt(ttnn_output.shape[-1])) + ttnn_output = torch.reshape(ttnn_output, (ttnn_output.shape[0], ttnn_output.shape[1], h, w)) - # torch_output = torch.permute(torch_output,(0,3,2,1)) - # torch_output = torch.reshape(torch_output,(batch_size, 1, 16384, 256)) - - # print("ddd", torch_output.shape, ttnn_output.shape) - + print(torch_output.shape, ttnn_output.shape) assert_with_pcc(torch_output, ttnn_output, pcc=0.99) diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_dwconv.py b/tests/ttnn/integration_tests/segformer/test_segformer_dwconv.py index 02de4d21d23..188ac4ecdfd 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_dwconv.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_dwconv.py @@ -8,9 +8,9 @@ from tests.ttnn.utils_for_testing import assert_with_pcc from transformers import SegformerModel -from models.experimental.functional_segformer.reference.segformer_dwconv import SegformerDWConv +from models.demos.segformer.reference.segformer_dwconv import SegformerDWConv import pytest -from models.experimental.functional_segformer.tt.ttnn_segformer_dwconv import TtSegformerDWConv +from models.demos.segformer.tt.ttnn_segformer_dwconv import TtSegformerDWConv from models.utility_functions import skip_for_grayskull diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_efficient_selfattention.py b/tests/ttnn/integration_tests/segformer/test_segformer_efficient_selfattention.py index 0b00c9a3e2b..ac7bd074624 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_efficient_selfattention.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_efficient_selfattention.py @@ -13,11 +13,11 @@ from transformers import SegformerModel import pytest from tests.ttnn.utils_for_testing import assert_with_pcc -from models.experimental.functional_segformer.tt.ttnn_segformer_efficient_selfattention import ( +from models.demos.segformer.tt.ttnn_segformer_efficient_selfattention import ( TtSegformerEfficientSelfAttention, ) from models.utility_functions import skip_for_grayskull -from models.experimental.functional_segformer.reference.segformer_efficient_selfattention import ( +from models.demos.segformer.reference.segformer_efficient_selfattention import ( SegformerEfficientSelfAttention, ) @@ -48,10 +48,10 @@ def custom_preprocessor(model, name, ttnn_module_args): parameters["layer_norm"] = {} parameters["layer_norm"]["weight"] = preprocess_layernorm_parameter( - model.layer_norm.weight, dtype=ttnn.bfloat16 + model.layer_norm.weight, dtype=ttnn.bfloat8_b ) parameters["layer_norm"]["bias"] = preprocess_layernorm_parameter( - model.layer_norm.bias, dtype=ttnn.bfloat16 + model.layer_norm.bias, dtype=ttnn.bfloat8_b ) return parameters @@ -64,13 +64,13 @@ def custom_preprocessor(model, name, ttnn_module_args): "batch_size, seq_len, hidden_size, height, width, num_attention_heads, sequence_reduction_ratio, block_i, efficient_self_attention_i", [ (1, 16384, 32, 128, 128, 1, 8, 0, 0), - # (1, 16384, 32, 128, 128, 1, 8, 0, 1), + (1, 16384, 32, 128, 128, 1, 8, 0, 1), (1, 4096, 64, 64, 64, 2, 4, 1, 0), - # (1, 4096, 64, 64, 64, 2, 4, 1, 1), + (1, 4096, 64, 64, 64, 2, 4, 1, 
1), (1, 1024, 160, 32, 32, 5, 2, 2, 0), - # (1, 1024, 160, 32, 32, 5, 2, 2, 1), + (1, 1024, 160, 32, 32, 5, 2, 2, 1), (1, 256, 256, 16, 16, 8, 1, 3, 0), - # (1, 256, 256, 16, 16, 8, 1, 3, 1), + (1, 256, 256, 16, 16, 8, 1, 3, 1), ], ) @pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True) @@ -91,7 +91,7 @@ def test_segformer_efficient_selfattention( if is_ci_env: pytest.skip("Skip in CI, model is WIP, issue# 13357") - torch_input_tensor = torch.randn(batch_size, seq_len, hidden_size) + torch_input_tensor = torch.randn(batch_size, 1, seq_len, hidden_size) ttnn_input_tensor = ttnn.from_torch( torch_input_tensor, dtype=ttnn.bfloat8_b, @@ -114,6 +114,7 @@ def test_segformer_efficient_selfattention( reference_model.load_state_dict(sd) reference_model.eval() + torch_input_tensor = torch.reshape(torch_input_tensor, (batch_size, seq_len, hidden_size)) torch_output = reference_model(torch_input_tensor, height, width) parameters = preprocess_model_parameters( @@ -133,6 +134,4 @@ def test_segformer_efficient_selfattention( ttnn_final_output = ttnn.to_torch(ttnn_output[0]) if len(ttnn_final_output.shape) == 4: ttnn_final_output = ttnn_final_output[0] - assert_with_pcc( - torch_output[0], ttnn_final_output, pcc=0.96 - ) # 0.98 to 0.96 due to adding parameters for linear and layernorm + assert_with_pcc(torch_output[0], ttnn_final_output, pcc=0.98) diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_encoder.py b/tests/ttnn/integration_tests/segformer/test_segformer_encoder.py index f4d1a2c5cba..bd315c59102 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_encoder.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_encoder.py @@ -21,10 +21,10 @@ from transformers import SegformerModel, SegformerConfig import pytest -from models.experimental.functional_segformer.tt.ttnn_segformer_encoder import ( +from models.demos.segformer.tt.ttnn_segformer_encoder import ( TtSegformerEncoder, ) -from models.experimental.functional_segformer.reference.segformer_encoder import SegformerEncoder +from models.demos.segformer.reference.segformer_encoder import SegformerEncoder def create_custom_preprocessor(device): @@ -52,10 +52,10 @@ def custom_preprocessor(model, name, ttnn_module_args): for i in range(4): parameters["layer_norm"][i] = {} parameters["layer_norm"][i]["weight"] = preprocess_layernorm_parameter( - model.layer_norm[i].weight, dtype=ttnn.bfloat16 + model.layer_norm[i].weight, dtype=ttnn.bfloat8_b ) parameters["layer_norm"][i]["bias"] = preprocess_layernorm_parameter( - model.layer_norm[i].bias, dtype=ttnn.bfloat16 + model.layer_norm[i].bias, dtype=ttnn.bfloat8_b ) return parameters @@ -93,13 +93,6 @@ def test_segformer_encoder(batch_size, num_channels, height, width, device, rese pytest.skip("Skip in CI, model is WIP, issue# 13357") torch_input_tensor = torch.randn(batch_size, num_channels, height, width) - ttnn_input_tensor = ttnn.from_torch( - torch_input_tensor, - dtype=ttnn.bfloat16, - memory_config=ttnn.L1_MEMORY_CONFIG, - device=device, - layout=ttnn.TILE_LAYOUT, - ) torch_model = SegformerModel.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") config = torch_model.config @@ -127,8 +120,47 @@ def test_segformer_encoder(batch_size, num_channels, height, width, device, rese ttnn_model = TtSegformerEncoder(config, parameters) + sharded_input_enabled = 0 + + if not sharded_input_enabled: + torch_input_tensor_permuted = torch.permute(torch_input_tensor, (0, 2, 3, 1)) + ttnn_input_tensor = ttnn.from_torch( + 
torch_input_tensor_permuted, + dtype=ttnn.bfloat16, + memory_config=ttnn.L1_MEMORY_CONFIG, + device=device, + layout=ttnn.TILE_LAYOUT, + ) + else: + torch_input_tensor = torch.permute(torch_input_tensor, (0, 2, 3, 1)) + torch_input_tensor = torch.nn.functional.pad(torch_input_tensor, (0, 13, 0, 0, 0, 0, 0, 0)) + N, H, W, C = torch_input_tensor.shape + torch_input_tensor = torch.reshape(torch_input_tensor, (N, 1, H * W, C)) + + shard_grid = ttnn.CoreRangeSet( + { + ttnn.CoreRange( + ttnn.CoreCoord(0, 0), + ttnn.CoreCoord(7, 7), + ), + } + ) + n_cores = 64 + shard_spec = ttnn.ShardSpec(shard_grid, [N * H * W // n_cores, C], ttnn.ShardOrientation.ROW_MAJOR, False) + input_mem_config = ttnn.MemoryConfig( + ttnn.types.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.types.BufferType.L1, shard_spec + ) + ttnn_input_tensor = ttnn.from_torch( + torch_input_tensor, + dtype=ttnn.bfloat16, + layout=ttnn.ROW_MAJOR_LAYOUT, + device=device, + memory_config=input_mem_config, + ) + ttnn_output = ttnn_model(ttnn_input_tensor, parameters=parameters) ttnn_final_output = ttnn.to_torch(ttnn_output.last_hidden_state) + torch_final_output = torch.permute(torch_output.last_hidden_state, (0, 2, 3, 1)) - assert_with_pcc(torch_output.last_hidden_state, ttnn_final_output, pcc=0.885) + assert_with_pcc(torch_final_output, ttnn_final_output, pcc=0.929) diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_for_semantic_segmentation.py b/tests/ttnn/integration_tests/segformer/test_segformer_for_semantic_segmentation.py index 849ed588e11..34b1b7d7ea8 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_for_semantic_segmentation.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_for_semantic_segmentation.py @@ -5,16 +5,18 @@ import pytest import ttnn from PIL import Image +import torch +import math import requests from tests.ttnn.utils_for_testing import assert_with_pcc from ttnn.model_preprocessing import preprocess_model_parameters, ParameterDict, ParameterList -from models.experimental.functional_segformer.tt.ttnn_segformer_for_semantic_segmentation import ( +from models.demos.segformer.tt.ttnn_segformer_for_semantic_segmentation import ( TtSegformerForSemanticSegmentation, ) from datasets import load_dataset from transformers import SegformerForSemanticSegmentation, SegformerImageProcessor -from models.experimental.functional_segformer.reference.segformer_for_semantic_segmentation import ( +from models.demos.segformer.reference.segformer_for_semantic_segmentation import ( SegformerForSemanticSegmentationReference, ) from tests.ttnn.integration_tests.segformer.test_segformer_model import ( @@ -78,14 +80,6 @@ def test_segformer_for_semantic_segmentation(device, is_ci_env): state_dict = torch_model.state_dict() inputs = processor(images=image, return_tensors="pt") - ttnn_input_tensor = ttnn.from_torch( - inputs.pixel_values, - dtype=ttnn.bfloat16, - memory_config=ttnn.L1_MEMORY_CONFIG, - device=device, - layout=ttnn.TILE_LAYOUT, - ) - new_state_dict = {} keys = [name for name, parameter in reference_model.state_dict().items()] values = [parameter for name, parameter in state_dict.items()] @@ -112,6 +106,15 @@ def test_segformer_for_semantic_segmentation(device, is_ci_env): ttnn_model = TtSegformerForSemanticSegmentation(config, parameters) + torch_input_tensor_permuted = torch.permute(inputs.pixel_values, (0, 2, 3, 1)) + ttnn_input_tensor = ttnn.from_torch( + torch_input_tensor_permuted, + dtype=ttnn.bfloat16, + memory_config=ttnn.L1_MEMORY_CONFIG, + device=device, + layout=ttnn.TILE_LAYOUT, + 
) + ttnn_output = ttnn_model( ttnn_input_tensor, output_attentions=None, @@ -119,6 +122,10 @@ def test_segformer_for_semantic_segmentation(device, is_ci_env): return_dict=None, parameters=parameters, ) - ttnn_final_output = ttnn.to_torch(ttnn_output.logits) - assert_with_pcc(torch_output.logits, ttnn_final_output, pcc=0.99) + ttnn_output = ttnn.to_torch(ttnn_output.logits) + ttnn_output = torch.permute(ttnn_output, (0, 3, 1, 2)) + h = w = int(math.sqrt(ttnn_output.shape[-1])) + ttnn_final_output = torch.reshape(ttnn_output, (ttnn_output.shape[0], ttnn_output.shape[1], h, w)) + + assert_with_pcc(torch_output.logits, ttnn_final_output, pcc=0.985) diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_layer.py b/tests/ttnn/integration_tests/segformer/test_segformer_layer.py index 1377cb413a3..a685a77e59d 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_layer.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_layer.py @@ -14,11 +14,11 @@ from transformers import SegformerModel import pytest -from models.experimental.functional_segformer.tt.ttnn_segformer_layer import ( +from models.demos.segformer.tt.ttnn_segformer_layer import ( TtSegformerLayer, ) -from models.experimental.functional_segformer.reference.segformer_layer import SegformerLayer +from models.demos.segformer.reference.segformer_layer import SegformerLayer from tests.ttnn.integration_tests.segformer.test_segformer_mix_ffn import ( create_custom_preprocessor as create_custom_preprocessor_mix_ffn, ) @@ -34,10 +34,10 @@ def custom_preprocessor(model, name, ttnn_module_args): if isinstance(model, SegformerLayer): parameters["layer_norm_1"] = {} parameters["layer_norm_1"]["weight"] = preprocess_layernorm_parameter( - model.layer_norm_1.weight, dtype=ttnn.bfloat16 + model.layer_norm_1.weight, dtype=ttnn.bfloat8_b ) parameters["layer_norm_1"]["bias"] = preprocess_layernorm_parameter( - model.layer_norm_1.bias, dtype=ttnn.bfloat16 + model.layer_norm_1.bias, dtype=ttnn.bfloat8_b ) attention_preprocess = create_custom_preprocessor_attention(device) @@ -50,10 +50,10 @@ def custom_preprocessor(model, name, ttnn_module_args): parameters["layer_norm_2"] = {} parameters["layer_norm_2"]["weight"] = preprocess_layernorm_parameter( - model.layer_norm_2.weight, dtype=ttnn.bfloat16 + model.layer_norm_2.weight, dtype=ttnn.bfloat8_b ) parameters["layer_norm_2"]["bias"] = preprocess_layernorm_parameter( - model.layer_norm_2.bias, dtype=ttnn.bfloat16 + model.layer_norm_2.bias, dtype=ttnn.bfloat8_b ) return parameters @@ -111,7 +111,7 @@ def test_segformer_layer( if is_ci_env: pytest.skip("Skip in CI, model is WIP, issue# 13357") - torch_input_tensor = torch.randn(batch_size, seq_len, hidden_size) + torch_input_tensor = torch.randn(batch_size, 1, seq_len, hidden_size) ttnn_input_tensor = ttnn.from_torch( torch_input_tensor, dtype=ttnn.bfloat16, @@ -136,6 +136,7 @@ def test_segformer_layer( reference_model.load_state_dict(sd) reference_model.eval() + torch_input_tensor = torch.reshape(torch_input_tensor, (batch_size, seq_len, hidden_size)) torch_output = reference_model(torch_input_tensor, height=height, width=width) parameters = preprocess_model_parameters( @@ -157,4 +158,4 @@ def test_segformer_layer( if len(ttnn_final_output.shape) == 4: ttnn_final_output = ttnn_final_output[0] - assert_with_pcc(torch_output[0], ttnn_final_output, pcc=0.94) + assert_with_pcc(torch_output[0], ttnn_final_output, pcc=0.99) diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_mix_ffn.py 
b/tests/ttnn/integration_tests/segformer/test_segformer_mix_ffn.py index 80b03d7a736..ac2fbd79692 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_mix_ffn.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_mix_ffn.py @@ -13,8 +13,8 @@ from transformers import SegformerModel import pytest -from models.experimental.functional_segformer.tt.ttnn_segformer_mix_ffn import TtSegformerMixFFN -from models.experimental.functional_segformer.reference.segformer_mixffn import SegformerMixFFN +from models.demos.segformer.tt.ttnn_segformer_mix_ffn import TtSegformerMixFFN +from models.demos.segformer.reference.segformer_mixffn import SegformerMixFFN from tests.ttnn.integration_tests.segformer.test_segformer_dwconv import ( create_custom_preprocessor as create_custom_preprocessor_dwconv, ) @@ -111,4 +111,4 @@ def test_segformer_mix_ffn( ttnn_output = ttnn.from_device(ttnn_output) ttnn_output = ttnn.to_torch(ttnn_output)[0] - assert_with_pcc(torch_output, ttnn_output, pcc=0.96) + assert_with_pcc(torch_output, ttnn_output, pcc=0.99) diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_mlp.py b/tests/ttnn/integration_tests/segformer/test_segformer_mlp.py index 80ad040fb03..ae9101601f3 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_mlp.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_mlp.py @@ -7,11 +7,11 @@ import torch from tests.ttnn.utils_for_testing import assert_with_pcc from ttnn.model_preprocessing import preprocess_model_parameters, preprocess_linear_weight, preprocess_linear_bias -from models.experimental.functional_segformer.tt.ttnn_segformer_mlp import ( +from models.demos.segformer.tt.ttnn_segformer_mlp import ( TtSegformerMLP, ) -from models.experimental.functional_segformer.reference.segformer_mlp import ( +from models.demos.segformer.reference.segformer_mlp import ( SegformerMLP, ) from transformers import SegformerForSemanticSegmentation diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_model.py b/tests/ttnn/integration_tests/segformer/test_segformer_model.py index bccf5dfd7cf..c10e2ac8ba8 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_model.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_model.py @@ -12,10 +12,10 @@ from transformers import SegformerModel import pytest -from models.experimental.functional_segformer.tt.ttnn_segformer_model import ( +from models.demos.segformer.tt.ttnn_segformer_model import ( TtSegformerModel, ) -from models.experimental.functional_segformer.reference.segformer_model import SegformerModelReference +from models.demos.segformer.reference.segformer_model import SegformerModelReference from models.utility_functions import skip_for_grayskull @@ -70,13 +70,7 @@ def test_segformer_model( pytest.skip("Skip in CI, model is WIP, issue# 13357") torch_input_tensor = torch.randn(batch_size, num_channels, height, width) - ttnn_input_tensor = ttnn.from_torch( - torch_input_tensor, - dtype=ttnn.bfloat16, - memory_config=ttnn.L1_MEMORY_CONFIG, - device=device, - layout=ttnn.TILE_LAYOUT, - ) + torch_model = SegformerModel.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") config = torch_model.config @@ -102,6 +96,15 @@ def test_segformer_model( ttnn_model = TtSegformerModel(config, parameters) + torch_input_tensor_permuted = torch.permute(torch_input_tensor, (0, 2, 3, 1)) + ttnn_input_tensor = ttnn.from_torch( + torch_input_tensor_permuted, + dtype=ttnn.bfloat16, + memory_config=ttnn.L1_MEMORY_CONFIG, + device=device, + 
layout=ttnn.TILE_LAYOUT, + ) + ttnn_output = ttnn_model( ttnn_input_tensor, output_attentions=None, @@ -110,5 +113,6 @@ def test_segformer_model( parameters=parameters, ) ttnn_final_output = ttnn.to_torch(ttnn_output[0]) + torch_final_output = torch.permute(torch_output.last_hidden_state, (0, 2, 3, 1)) - assert_with_pcc(torch_output[0], ttnn_final_output, pcc=0.885) + assert_with_pcc(torch_final_output, ttnn_final_output, pcc=0.929) diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_overlap_path_embeddings.py b/tests/ttnn/integration_tests/segformer/test_segformer_overlap_path_embeddings.py index fd080e1f8d7..6f6d22ca9d2 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_overlap_path_embeddings.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_overlap_path_embeddings.py @@ -11,11 +11,11 @@ from tests.ttnn.utils_for_testing import assert_with_pcc from transformers import SegformerModel import pytest -from models.experimental.functional_segformer.tt.ttnn_segformer_overlap_patch_embeddings import ( +from models.demos.segformer.tt.ttnn_segformer_overlap_patch_embeddings import ( TtSegformerOverlapPatchEmbeddings, ) -from models.experimental.functional_segformer.reference.segformer_overlap_patch_embeddings import ( +from models.demos.segformer.reference.segformer_overlap_patch_embeddings import ( SegformerOverlapPatchEmbeddings, ) from models.utility_functions import skip_for_grayskull @@ -34,10 +34,10 @@ def custom_preprocessor(model, name, ttnn_module_args): parameters["layer_norm"] = {} parameters["layer_norm"]["weight"] = preprocess_layernorm_parameter( - model.layer_norm.weight, dtype=ttnn.bfloat16 + model.layer_norm.weight, dtype=ttnn.bfloat8_b ) parameters["layer_norm"]["bias"] = preprocess_layernorm_parameter( - model.layer_norm.bias, dtype=ttnn.bfloat16 + model.layer_norm.bias, dtype=ttnn.bfloat8_b ) return parameters @@ -73,22 +73,6 @@ def test_segformer_overlap_patch_embeddings( pytest.skip("Skip in CI, model is WIP, issue# 13357") torch_input_tensor = torch.randn(batch_size, num_channels, height, width) - if width == 512: - ttnn_input_tensor = ttnn.from_torch( - torch_input_tensor, - dtype=ttnn.bfloat16, - memory_config=ttnn.L1_MEMORY_CONFIG, - device=device, - layout=ttnn.TILE_LAYOUT, - ) - else: - ttnn_input_tensor = ttnn.from_torch( - torch_input_tensor, - dtype=ttnn.bfloat8_b, - memory_config=ttnn.L1_MEMORY_CONFIG, - device=device, - layout=ttnn.TILE_LAYOUT, - ) torch_model = SegformerModel.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") @@ -114,6 +98,27 @@ def test_segformer_overlap_patch_embeddings( stride=stride, ) + post_process_it = 0 + if width == 512: + torch_input_tensor = torch.permute(torch_input_tensor, (0, 2, 3, 1)) + ttnn_input_tensor = ttnn.from_torch( + torch_input_tensor, + dtype=ttnn.bfloat16, + memory_config=ttnn.L1_MEMORY_CONFIG, + device=device, + layout=ttnn.TILE_LAYOUT, + ) + else: + torch_input_tensor = torch.permute(torch_input_tensor, (0, 2, 3, 1)) + ttnn_input_tensor = ttnn.from_torch( + torch_input_tensor, + dtype=ttnn.bfloat16, + memory_config=ttnn.L1_MEMORY_CONFIG, + device=device, + layout=ttnn.ROW_MAJOR_LAYOUT, + ) + post_process_it = 1 + ttnn_output, height, width = ttnn_model( ttnn_input_tensor, parameters=parameters, diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_selfoutput.py b/tests/ttnn/integration_tests/segformer/test_segformer_selfoutput.py index 525795c8c0f..bb0e766fcc2 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_selfoutput.py +++ 
b/tests/ttnn/integration_tests/segformer/test_segformer_selfoutput.py @@ -12,8 +12,8 @@ from tests.ttnn.utils_for_testing import assert_with_pcc from transformers import SegformerModel import pytest -from models.experimental.functional_segformer.tt.ttnn_segformer_selfoutput import TtSegformerSelfOutput -from models.experimental.functional_segformer.reference.segformer_selfoutput import SegformerSelfOutput +from models.demos.segformer.tt.ttnn_segformer_selfoutput import TtSegformerSelfOutput +from models.demos.segformer.reference.segformer_selfoutput import SegformerSelfOutput from models.utility_functions import skip_for_grayskull diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py index 00be4435617..31f4636aa66 100644 --- a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py +++ b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py @@ -12,7 +12,7 @@ def perf_report(file_path): df = df.dropna(subset=["DEVICE ERISC KERNEL DURATION [ns]"]) df = df[df["OP TO OP LATENCY [ns]"] != 0] - df = df[df["TRACE ID"].notna() & (df["TRACE ID"] != "")] + df = df[df["METAL TRACE ID"].notna() & (df["METAL TRACE ID"] != "")] def remove_keys_from_attributes(attributes): attributes = attributes.replace(";", ",").replace("'", '"') @@ -56,7 +56,9 @@ def safe_parse_attributes(attributes): ) df["dim"] = df["ATTRIBUTES"].apply( - lambda x: safe_parse_attributes(x).get("dim", "") if isinstance(safe_parse_attributes(x), dict) else "" + lambda x: safe_parse_attributes(x).get("dim", safe_parse_attributes(x).get("scatter_dim", "")) + if isinstance(safe_parse_attributes(x), dict) + else "" ) df["num_links"] = df["ATTRIBUTES"].apply( @@ -154,15 +156,15 @@ def calculate_bandwidth(row): op_bw = (output_tensor_volume * (n_chips - 1) / n_chips) / longest_device_fw_time link_bw = (output_tensor_volume * (n_chips - 1) / n_chips) / longest_erisc_fw_time elif row["OP CODE"] == "ReduceScatter": - op_bw = (input_tensor_volume / n_chips) / longest_device_fw_time - link_bw = (input_tensor_volume * (n_chips - 1) / n_chips) / longest_erisc_fw_time + op_bw = input_tensor_volume / longest_device_fw_time + link_bw = input_tensor_volume / longest_erisc_fw_time elif row["topology"] == "Linear": if row["OP CODE"] == "AllGather": op_bw = input_tensor_volume * n_chips / longest_device_fw_time link_bw = input_tensor_volume * (n_chips - 1) / longest_erisc_fw_time elif row["OP CODE"] == "ReduceScatter": op_bw = input_tensor_volume / longest_device_fw_time - link_bw = input_tensor_volume * (n_chips - 1) / n_chips / longest_erisc_fw_time + link_bw = input_tensor_volume / longest_erisc_fw_time return round(op_bw, 2), round(link_bw, 2) for i, (group, group_df) in enumerate(grouped, start=1): @@ -194,13 +196,17 @@ def calculate_bandwidth(row): "output_mem_config": group_df["output_mem_config"].iloc[0] if "output_mem_config" in group_df else "", "topology": group_df["topology"].iloc[0], "Layout": group_df["Layout"].iloc[0] if "Layout" in group_df else "", + "Data Type": group_df["Data Type"].iloc[0] if "Data Type" in group_df else "", } for column in numeric_columns: min_val = round(group_df[column].min(), 2) largest_vals = group_df[column].nlargest(3) max_val = round(largest_vals.iloc[-1], 2) - avg_val = round(group_df[column][~group_df[column].isin(largest_vals.head(2))].mean(), 2) + if min_val == max_val: + avg_val = min_val + else: + avg_val = round(group_df[column][~group_df[column].isin(largest_vals.head(2))].mean(), 2) group_data[column] = f"{min_val} - {avg_val} - 
{max_val}" diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh similarity index 100% rename from tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh rename to tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh new file mode 100755 index 00000000000..23071225ac1 --- /dev/null +++ b/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh @@ -0,0 +1,97 @@ +#!/bin/sh +MODULE_DIR="tests/ttnn/unit_tests/operations/ccl/perf" + +# Defaults +DEBUG=false +TARGET="n300" + +# Function to display help +show_help() { + echo "Usage: ./tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh [OPTIONS]" + echo + echo "Options:" + echo " -d, --debug Enable debug mode to show real-time output." + echo " -t, --target Specify the target configuration (t3000 or n300). Default is n300." + echo " -h, --help Display this help message." + echo + echo "Example:" + echo " ./tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh --debug --target n300" + echo " ./tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh -h" +} + +# Parse command-line arguments +while [ $# -gt 0 ]; do + case "$1" in + --debug|-d) + DEBUG=true + shift + ;; + --help|-h) + show_help + exit 0 + ;; + --target|-t) + # Ensure there is an argument following the target flag + if [ -z "$2" ]; then + echo "Error: No target specified after $1." + show_help + exit 1 + fi + + TARGET="$2" # Set the target configuration + shift 2 + + # Validate the target value + if [ "$TARGET" != "t3000" ] && [ "$TARGET" != "n300" ]; then + echo "Error: Invalid target configuration: $TARGET. Must be either 't3000' or 'n300'." + exit 1 + fi + ;; + *) + echo "Unknown option: $1" + show_help + exit 1 + ;; + esac +done + +# Function to run the profiling command and extract the CSV path +run_profile_and_extract_csv() { + command="./tt_metal/tools/profiler/profile_this.py -n reduce_scatter_$TARGET -c 'pytest tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py::test_reduce_scatter_on_$TARGET'" + + if [ "$DEBUG" = true ]; then + echo "Running profiling command for target $TARGET in debug mode..." + full_output=$(eval $command 2>&1 | tee /dev/tty) + else + echo "Running profiling command for target $TARGET..." + full_output=$(eval $command 2>&1) + fi + + # Extract the CSV path + csv_path=$(echo "$full_output" | grep -oE 'OPs csv generated at: (.+\.csv)' | sed -E 's/OPs csv generated at: //') + + if [ -n "$csv_path" ]; then + echo "CSV path found: $csv_path" + + # Run the Python script to generate performance report + average_values=$(PYTHONPATH="$MODULE_DIR" python3 -c " +import pandas as pd +from perf_csv import perf_report +from tabulate import tabulate + +# Generate the report and convert it to a DataFrame +average_df = perf_report('$csv_path') +# Print the DataFrame in a pretty table format +print(tabulate(average_df, headers='keys', tablefmt='pretty')) +") + + # Print the output + echo "Min - Avg - Max by Common Runs:" + echo "$average_values" + else + echo "CSV path not found in the command output." 
+ fi +} + +# Run the function +run_profile_and_extract_csv diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py index c9a6c90ef31..1429eb0fce1 100644 --- a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py +++ b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py @@ -9,6 +9,9 @@ run_all_gather_on_n300_impl, run_all_gather_on_t3000_impl_tight_loop, ) +from tests.ttnn.unit_tests.operations.ccl.test_reduce_scatter_post_commit import ( + run_reduce_scatter_test, +) @skip_for_grayskull("Requires eth connected devices to run") @@ -128,3 +131,138 @@ def test_all_gather_on_t3000( enable_async=enable_async, trace_mode=True, ) + + +@skip_for_grayskull("Requires eth connected devices to run") +@pytest.mark.parametrize( + "num_devices, num_links", + [ + (8, 1), + ], +) +@pytest.mark.parametrize( + "per_chip_output_shape, scatter_dim, layout", + [ + ([1, 8, 1024, 1024], 3, ttnn.TILE_LAYOUT), + ([1, 4, 1024, 1024], 3, ttnn.TILE_LAYOUT), + ([1, 4, 2048, 1024], 3, ttnn.TILE_LAYOUT), + ([1, 1, 32, 32 * 8], 3, ttnn.TILE_LAYOUT), + ([1, 1, 32, 64 * 8], 3, ttnn.TILE_LAYOUT), + ], +) +@pytest.mark.parametrize( + "input_dtype", + [ + ttnn.bfloat16, + ], +) +@pytest.mark.parametrize( + "mem_config", + [ + ttnn.MemoryConfig(buffer_type=ttnn.BufferType.DRAM), + ], +) +@pytest.mark.parametrize("num_iters", [20]) +@pytest.mark.parametrize("math_op", [ttnn.ReduceType.Sum]) +@pytest.mark.parametrize("enable_async", [True]) +@pytest.mark.parametrize("topology", [ttnn.Topology.Linear, ttnn.Topology.Ring]) +@pytest.mark.parametrize("device_params", [{"trace_region_size": 266240}], indirect=True) +def test_reduce_scatter_on_t3000( + t3k_mesh_device, + num_devices, + per_chip_output_shape, + scatter_dim, + num_links, + math_op, + input_dtype, + layout, + mem_config, + use_program_cache, + function_level_defaults, + enable_async, + num_iters, + topology, +): + run_reduce_scatter_test( + t3k_mesh_device, + num_devices, + per_chip_output_shape, + scatter_dim, + num_links, + math_op, + input_dtype, + layout, + mem_config, + use_program_cache, + function_level_defaults, + num_iters=num_iters, + enable_async=enable_async, + topology=topology, + trace_mode=True, + ) + + +@skip_for_grayskull("Requires eth connected devices to run") +@pytest.mark.parametrize( + "num_devices, num_links", + [ + (2, 1), + ], +) +@pytest.mark.parametrize( + "per_chip_output_shape, scatter_dim, layout", + [ + ([1, 1, 32, 4096], 3, ttnn.TILE_LAYOUT), + ([1, 1, 32, 2048], 3, ttnn.TILE_LAYOUT), + ([1, 1, 32, 1024], 3, ttnn.TILE_LAYOUT), + ], +) +@pytest.mark.parametrize( + "input_dtype", + [ + ttnn.bfloat16, + ttnn.bfloat8_b, + ], +) +@pytest.mark.parametrize( + "mem_config", + [ + ttnn.MemoryConfig(buffer_type=ttnn.BufferType.DRAM), + ttnn.MemoryConfig(buffer_type=ttnn.BufferType.L1), + ], +) +@pytest.mark.parametrize("num_iters", [20]) +@pytest.mark.parametrize("math_op", [ttnn.ReduceType.Sum]) +@pytest.mark.parametrize("enable_async", [True]) +@pytest.mark.parametrize("device_params", [{"trace_region_size": 266240}], indirect=True) +def test_reduce_scatter_on_n300( + n300_mesh_device, + num_devices, + per_chip_output_shape, + scatter_dim, + num_links, + math_op, + input_dtype, + layout, + mem_config, + use_program_cache, + function_level_defaults, + enable_async, + num_iters, +): + run_reduce_scatter_test( + n300_mesh_device, + num_devices, + per_chip_output_shape, + scatter_dim, + num_links, + math_op, + input_dtype, + layout, + mem_config, + 
use_program_cache, + function_level_defaults, + num_iters=num_iters, + enable_async=enable_async, + trace_mode=True, + ) diff --git a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py index 9fbc710ed7c..916682dd84e 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py @@ -32,9 +32,10 @@ def run_with_trace( num_links, math_op, output_mem_config, - n_worker, - n_buffer, - num_iters, + n_worker=None, + n_buffer=None, + num_iters=40, + topology=ttnn.Topology.Ring, ): # Compile Run logger.info("Compiling model") @@ -46,6 +47,7 @@ def run_with_trace( memory_config=output_mem_config, num_workers=n_worker, num_buffers_per_channel=n_buffer, + topology=topology, ) for device_id in t3k_mesh_device.get_device_ids(): ttnn.synchronize_device(t3k_mesh_device.get_device(device_id)) @@ -62,6 +64,7 @@ def run_with_trace( memory_config=output_mem_config, num_workers=n_worker, num_buffers_per_channel=n_buffer, + topology=topology, ) ttnn.end_trace_capture(t3k_mesh_device, trace_id, cq_id=0) for device_id in t3k_mesh_device.get_device_ids(): @@ -92,6 +95,7 @@ def run_reduce_scatter_test( enable_async=True, num_iters=1, topology=ttnn.Topology.Ring, + trace_mode=False, ): if len(mesh_device.get_device_ids()) < num_devices: pytest.skip( @@ -135,19 +139,31 @@ def run_reduce_scatter_test( input_tensor_mesh = ttnn.aggregate_as_tensor(tt_input_tensors) # Run the op - for i in range(num_iters): - output_tensor_mesh = ttnn.reduce_scatter( + if trace_mode: + output_tensor_mesh = run_with_trace( + mesh_device, input_tensor_mesh, - scatter_dim=scatter_dim, - math_op=math_op, - num_links=num_links, - memory_config=mem_config, + scatter_dim, + num_links, + math_op, + mem_config, + num_iters=num_iters, topology=topology, ) + else: + for i in range(num_iters): + output_tensor_mesh = ttnn.reduce_scatter( + input_tensor_mesh, + scatter_dim=scatter_dim, + math_op=math_op, + num_links=num_links, + memory_config=mem_config, + topology=topology, + ) - for device_id in mesh_device.get_device_ids(): - ttnn.synchronize_device(mesh_device.get_device(device_id)) - logger.info(f"Done iteration {i}") + for device_id in mesh_device.get_device_ids(): + ttnn.synchronize_device(mesh_device.get_device(device_id)) + logger.info(f"Done iteration {i}") # ttnn.visualize_mesh_device(t3k_mesh_device, tensor=output_tensor_mesh) # Compute golden diff --git a/tt_metal/hw/inc/blackhole/eth_interface.h b/tt_metal/hw/inc/blackhole/eth_interface.h deleted file mode 100644 index 676ba847483..00000000000 --- a/tt_metal/hw/inc/blackhole/eth_interface.h +++ /dev/null @@ -1,7 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "tt_metal/third_party/umd/src/firmware/riscv/blackhole/eth_interface.h" diff --git a/tt_metal/hw/inc/grayskull/eth_interface.h b/tt_metal/hw/inc/grayskull/eth_interface.h deleted file mode 100644 index 1f965f087db..00000000000 --- a/tt_metal/hw/inc/grayskull/eth_interface.h +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -const uint32_t ETH_RACK_COORD_WIDTH = 0; -const uint32_t CMD_BUF_SIZE_MASK = 0; -const uint32_t MAX_BLOCK_SIZE = 0; -const uint32_t REQUEST_CMD_QUEUE_BASE = 0; -const uint32_t RESPONSE_CMD_QUEUE_BASE = 0; -const uint32_t CMD_COUNTERS_SIZE_BYTES = 0; -const uint32_t REMOTE_UPDATE_PTR_SIZE_BYTES = 0; -const uint32_t CMD_DATA_BLOCK = 0; -const uint32_t CMD_WR_REQ = 0; -const uint32_t CMD_WR_ACK = 0; -const uint32_t CMD_RD_REQ = 0; -const uint32_t CMD_RD_DATA = 0; -const uint32_t CMD_BUF_SIZE = 0; -const uint32_t CMD_DATA_BLOCK_DRAM = 0; -const uint32_t ETH_ROUTING_DATA_BUFFER_ADDR = 0; -const uint32_t REQUEST_ROUTING_CMD_QUEUE_BASE = 0; -const uint32_t RESPONSE_ROUTING_CMD_QUEUE_BASE = 0; -const uint32_t CMD_BUF_PTR_MASK = 0; diff --git a/tt_metal/hw/inc/wormhole/eth_interface.h b/tt_metal/hw/inc/wormhole/eth_interface.h deleted file mode 100644 index 53285a9529e..00000000000 --- a/tt_metal/hw/inc/wormhole/eth_interface.h +++ /dev/null @@ -1,7 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "tt_metal/third_party/umd/src/firmware/riscv/wormhole/eth_interface.h" diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 8098a0cc5f1..17cacd5e51a 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -2378,9 +2378,17 @@ void HWCommandQueue::enqueue_trace(const uint32_t trace_id, bool blocking) { // Increment the expected worker cores counter due to trace programs completion this->expected_num_workers_completed += trace_inst->desc->num_completion_worker_cores; // After trace runs, the rdptr on each worker will be incremented by the number of programs in the trace - // Update the wptr on host to match state - this->device->worker_launch_message_buffer_state.set_mcast_wptr(trace_inst->desc->num_traced_programs_needing_go_signal_multicast); - this->device->worker_launch_message_buffer_state.set_unicast_wptr(trace_inst->desc->num_traced_programs_needing_go_signal_unicast); + // Update the wptr on host to match state. If the trace doesn't execute on a + // class of worker (unicast or multicast), it doesn't reset or modify the + // state for those workers. + if (trace_inst->desc->num_traced_programs_needing_go_signal_multicast) { + this->device->worker_launch_message_buffer_state.set_mcast_wptr( + trace_inst->desc->num_traced_programs_needing_go_signal_multicast); + } + if (trace_inst->desc->num_traced_programs_needing_go_signal_unicast) { + this->device->worker_launch_message_buffer_state.set_unicast_wptr( + trace_inst->desc->num_traced_programs_needing_go_signal_unicast); + } // The config buffer manager is unaware of what memory is used inside the trace, so mark all memory as used so that // it will force a stall and avoid stomping on in-use state. // TODO(jbauman): Reuse old state from the trace. 
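A brief illustration of the guarded wptr update above (hypothetical names; the real set_mcast_wptr/set_unicast_wptr semantics are simplified): if a trace contains no programs for one worker class, writing that class's write pointer anyway would clobber state left by earlier non-trace programs, so the update is applied only for worker classes the trace actually touched.

// Sketch only: mirrors the conditional-update logic above with hypothetical types.
#include <cstdint>

struct WorkerLaunchMsgBufferState {
    uint32_t mcast_wptr = 0;    // host-side write pointer for multicast workers
    uint32_t unicast_wptr = 0;  // host-side write pointer for unicast (eth) workers
};

// Apply the post-trace pointer update only for worker classes the trace used,
// so a trace with zero programs of a class leaves that class's state alone.
inline void update_wptrs_after_trace(WorkerLaunchMsgBufferState& state,
                                     uint32_t traced_mcast_programs,
                                     uint32_t traced_unicast_programs) {
    if (traced_mcast_programs != 0) {
        state.mcast_wptr = traced_mcast_programs;      // simplified stand-in for set_mcast_wptr
    }
    if (traced_unicast_programs != 0) {
        state.unicast_wptr = traced_unicast_programs;  // simplified stand-in for set_unicast_wptr
    }
}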
diff --git a/tt_metal/impl/dispatch/command_queue_interface.hpp b/tt_metal/impl/dispatch/command_queue_interface.hpp index 05a19ea4d47..04ff630f1f3 100644 --- a/tt_metal/impl/dispatch/command_queue_interface.hpp +++ b/tt_metal/impl/dispatch/command_queue_interface.hpp @@ -170,11 +170,11 @@ struct dispatch_constants { if (dev_addr_type == CommandQueueDeviceAddrType::PREFETCH_Q_RD) { device_cq_addr_sizes_[dev_addr_idx] = sizeof(uint32_t); } else if (dev_addr_type == CommandQueueDeviceAddrType::PREFETCH_Q_PCIE_RD) { - device_cq_addr_sizes_[dev_addr_idx] = L1_ALIGNMENT - sizeof(uint32_t); + device_cq_addr_sizes_[dev_addr_idx] = l1_alignment - sizeof(uint32_t); } else if (dev_addr_type == CommandQueueDeviceAddrType::DISPATCH_MESSAGE) { - device_cq_addr_sizes_[dev_addr_idx] = 32; // Should this be 2x L1_ALIGNMENT? + device_cq_addr_sizes_[dev_addr_idx] = 32; // Should this be 2x l1_alignment? } else { - device_cq_addr_sizes_[dev_addr_idx] = L1_ALIGNMENT; + device_cq_addr_sizes_[dev_addr_idx] = l1_alignment; } } diff --git a/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp b/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp index 763e6567b01..7e17ccfaca0 100644 --- a/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp +++ b/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp @@ -6,6 +6,8 @@ #define COMPILE_FOR_IDLE_ERISC +#include + #include "llrt/hal.hpp" #include "llrt/blackhole/bh_hal.hpp" #include "hw/inc/blackhole/core_config.h" @@ -29,7 +31,7 @@ HalCoreInfoType create_active_eth_mem_map() { std::vector mem_map_bases; mem_map_bases.resize(utils::underlying_type(HalL1MemAddrType::COUNT)); - mem_map_bases[utils::underlying_type(HalL1MemAddrType::BARRIER)] = MEM_L1_BARRIER; + mem_map_bases[utils::underlying_type(HalL1MemAddrType::BARRIER)] = eth_l1_mem::address_map::ERISC_BARRIER_BASE; mem_map_bases[utils::underlying_type(HalL1MemAddrType::MAILBOX)] = eth_l1_mem::address_map::ERISC_MEM_MAILBOX_BASE; mem_map_bases[utils::underlying_type(HalL1MemAddrType::LAUNCH)] = GET_ETH_MAILBOX_ADDRESS_HOST(launch); mem_map_bases[utils::underlying_type(HalL1MemAddrType::WATCHER)] = GET_ETH_MAILBOX_ADDRESS_HOST(watcher); @@ -40,10 +42,11 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_bases[utils::underlying_type(HalL1MemAddrType::CORE_INFO)] = GET_ETH_MAILBOX_ADDRESS_HOST(core_info); mem_map_bases[utils::underlying_type(HalL1MemAddrType::GO_MSG)] = GET_ETH_MAILBOX_ADDRESS_HOST(go_message); mem_map_bases[utils::underlying_type(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = GET_ETH_MAILBOX_ADDRESS_HOST(launch_msg_rd_ptr); + mem_map_bases[utils::underlying_type(HalL1MemAddrType::FW_VERSION_ADDR)] = eth_l1_mem::address_map::FW_VERSION_ADDR; std::vector mem_map_sizes; mem_map_sizes.resize(utils::underlying_type(HalL1MemAddrType::COUNT)); - mem_map_sizes[utils::underlying_type(HalL1MemAddrType::BARRIER)] = sizeof(uint32_t); + mem_map_sizes[utils::underlying_type(HalL1MemAddrType::BARRIER)] = eth_l1_mem::address_map::ERISC_BARRIER_SIZE; mem_map_sizes[utils::underlying_type(HalL1MemAddrType::MAILBOX)] = eth_l1_mem::address_map::ERISC_MEM_MAILBOX_SIZE; mem_map_sizes[utils::underlying_type(HalL1MemAddrType::LAUNCH)] = sizeof(launch_msg_t); mem_map_sizes[utils::underlying_type(HalL1MemAddrType::WATCHER)] = sizeof(watcher_msg_t); @@ -53,6 +56,7 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_sizes[utils::underlying_type(HalL1MemAddrType::UNRESERVED)] = eth_l1_mem::address_map::MAX_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; mem_map_sizes[utils::underlying_type(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); 
mem_map_sizes[utils::underlying_type(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(uint32_t); + mem_map_sizes[utils::underlying_type(HalL1MemAddrType::FW_VERSION_ADDR)] = sizeof(std::uint32_t); // TODO (abhullar): This should be NumEthDispatchClasses std::vector> processor_classes(1); diff --git a/tt_metal/llrt/hal.hpp b/tt_metal/llrt/hal.hpp index 875252ca8da..1ba7a104e84 100644 --- a/tt_metal/llrt/hal.hpp +++ b/tt_metal/llrt/hal.hpp @@ -50,6 +50,7 @@ enum class HalL1MemAddrType : uint8_t { CORE_INFO, GO_MSG, LAUNCH_MSG_BUFFER_RD_PTR, + FW_VERSION_ADDR, // Really only applicable to active eth core right now COUNT // Keep this last so it always indicates number of enum options }; diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index 265e928514b..715ab3c974e 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -271,7 +271,6 @@ void Cluster::open_driver(const bool &skip_driver_allocs) { device_driver->configure_active_ethernet_cores_for_mmio_device(mmio_device_id, {}); } } - device_driver->set_driver_eth_interface_params(eth_interface_params); // Adding this check is a workaround for current UMD bug that only uses this getter to populate private metadata // that is later expected to be populated by unrelated APIs @@ -281,6 +280,12 @@ void Cluster::open_driver(const bool &skip_driver_allocs) { } std::uint32_t dram_barrier_base = tt_metal::hal.get_dev_addr(tt_metal::HalDramMemAddrType::DRAM_BARRIER); device_driver->set_device_dram_address_params(tt_device_dram_address_params{dram_barrier_base}); + + l1_address_params.tensix_l1_barrier_base = tt_metal::hal.get_dev_addr(tt_metal::HalProgrammableCoreType::TENSIX, tt_metal::HalL1MemAddrType::BARRIER); + if (tt_metal::hal.get_arch() != tt::ARCH::GRAYSKULL) { + l1_address_params.eth_l1_barrier_base = tt_metal::hal.get_dev_addr(tt_metal::HalProgrammableCoreType::ACTIVE_ETH, tt_metal::HalL1MemAddrType::BARRIER); + l1_address_params.fw_version_addr = tt_metal::hal.get_dev_addr(tt_metal::HalProgrammableCoreType::ACTIVE_ETH, tt_metal::HalL1MemAddrType::FW_VERSION_ADDR); + } device_driver->set_device_l1_address_params(l1_address_params); this->get_metal_desc_from_tt_desc( @@ -344,18 +349,6 @@ uint32_t Cluster::get_harvested_rows(chip_id_t chip) const { } } -void Cluster::verify_eth_fw() const { - for (const auto &[chip, mmio_device_id] : this->device_to_mmio_device_) { - std::vector fw_versions; - for (const CoreCoord ð_core : get_soc_desc(chip).ethernet_cores) { - uint32_t val; - read_core(&val, sizeof(uint32_t), tt_cxy_pair(chip, eth_core), eth_l1_mem::address_map::FW_VERSION_ADDR); - fw_versions.push_back(val); - } - verify_sw_fw_versions(chip, SW_VERSION, fw_versions); - } -} - int Cluster::get_device_aiclk(const chip_id_t &chip_id) const { if (this->arch_ == tt::ARCH::BLACKHOLE) { // For Blackhole bring up remove AICLK query due to lack of ARC message support diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp index 26fc0409d71..f59bf5d42e9 100644 --- a/tt_metal/llrt/tt_cluster.hpp +++ b/tt_metal/llrt/tt_cluster.hpp @@ -17,7 +17,6 @@ // clang-format off #include "noc/noc_parameters.h" -#include "eth_interface.h" #include "eth_l1_address_map.h" #include "dev_msgs.h" // clang-format on @@ -72,7 +71,6 @@ class Cluster { } //! 
device driver and misc apis - void verify_eth_fw() const; void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector &fw_versions) const; void deassert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const; @@ -285,39 +283,7 @@ class Cluster { // Mapping of each devices' ethernet routing mode std::unordered_map> device_eth_routing_info_; - tt_device_l1_address_params l1_address_params = { - (uint32_t)MEM_NCRISC_FIRMWARE_BASE, - (uint32_t)MEM_BRISC_FIRMWARE_BASE, - (uint32_t)MEM_TRISC0_FIRMWARE_SIZE, - (uint32_t)MEM_TRISC1_FIRMWARE_SIZE, - (uint32_t)MEM_TRISC2_FIRMWARE_SIZE, - (uint32_t)MEM_TRISC0_FIRMWARE_BASE, - (uint32_t)MEM_L1_BARRIER, - (uint32_t)eth_l1_mem::address_map::ERISC_BARRIER_BASE, - (uint32_t)eth_l1_mem::address_map::FW_VERSION_ADDR, - }; - - tt_driver_eth_interface_params eth_interface_params = { - NOC_ADDR_LOCAL_BITS, - NOC_ADDR_NODE_ID_BITS, - ETH_RACK_COORD_WIDTH, - CMD_BUF_SIZE_MASK, - MAX_BLOCK_SIZE, - REQUEST_CMD_QUEUE_BASE, - RESPONSE_CMD_QUEUE_BASE, - CMD_COUNTERS_SIZE_BYTES, - REMOTE_UPDATE_PTR_SIZE_BYTES, - CMD_DATA_BLOCK, - CMD_WR_REQ, - CMD_WR_ACK, - CMD_RD_REQ, - CMD_RD_DATA, - CMD_BUF_SIZE, - CMD_DATA_BLOCK_DRAM, - ETH_ROUTING_DATA_BUFFER_ADDR, - REQUEST_ROUTING_CMD_QUEUE_BASE, - RESPONSE_ROUTING_CMD_QUEUE_BASE, - CMD_BUF_PTR_MASK}; + tt_device_l1_address_params l1_address_params; std::unordered_map>> ethernet_sockets_; }; diff --git a/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp b/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp index 423098ba44f..ecb4a74a1ef 100644 --- a/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp +++ b/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp @@ -6,6 +6,8 @@ #define COMPILE_FOR_ERISC +#include + #include "llrt/hal.hpp" #include "llrt/wormhole/wh_hal.hpp" #include "hw/inc/wormhole/core_config.h" @@ -29,7 +31,7 @@ HalCoreInfoType create_active_eth_mem_map() { std::vector mem_map_bases; mem_map_bases.resize(utils::underlying_type(HalL1MemAddrType::COUNT)); - mem_map_bases[utils::underlying_type(HalL1MemAddrType::BARRIER)] = MEM_L1_BARRIER; + mem_map_bases[utils::underlying_type(HalL1MemAddrType::BARRIER)] = eth_l1_mem::address_map::ERISC_BARRIER_BASE; mem_map_bases[utils::underlying_type(HalL1MemAddrType::MAILBOX)] = eth_l1_mem::address_map::ERISC_MEM_MAILBOX_BASE; mem_map_bases[utils::underlying_type(HalL1MemAddrType::LAUNCH)] = GET_ETH_MAILBOX_ADDRESS_HOST(launch); mem_map_bases[utils::underlying_type(HalL1MemAddrType::WATCHER)] = GET_ETH_MAILBOX_ADDRESS_HOST(watcher); @@ -40,10 +42,11 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_bases[utils::underlying_type(HalL1MemAddrType::CORE_INFO)] = GET_ETH_MAILBOX_ADDRESS_HOST(core_info); mem_map_bases[utils::underlying_type(HalL1MemAddrType::GO_MSG)] = GET_ETH_MAILBOX_ADDRESS_HOST(go_message); mem_map_bases[utils::underlying_type(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = GET_ETH_MAILBOX_ADDRESS_HOST(launch_msg_rd_ptr); + mem_map_bases[utils::underlying_type(HalL1MemAddrType::FW_VERSION_ADDR)] = eth_l1_mem::address_map::FW_VERSION_ADDR; std::vector mem_map_sizes; mem_map_sizes.resize(utils::underlying_type(HalL1MemAddrType::COUNT)); - mem_map_sizes[utils::underlying_type(HalL1MemAddrType::BARRIER)] = sizeof(uint32_t); + mem_map_sizes[utils::underlying_type(HalL1MemAddrType::BARRIER)] = eth_l1_mem::address_map::ERISC_BARRIER_SIZE; mem_map_sizes[utils::underlying_type(HalL1MemAddrType::MAILBOX)] = eth_l1_mem::address_map::ERISC_MEM_MAILBOX_SIZE; mem_map_sizes[utils::underlying_type(HalL1MemAddrType::LAUNCH)] = sizeof(launch_msg_t); 
mem_map_sizes[utils::underlying_type(HalL1MemAddrType::WATCHER)] = sizeof(watcher_msg_t); @@ -53,6 +56,7 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_sizes[utils::underlying_type(HalL1MemAddrType::UNRESERVED)] = eth_l1_mem::address_map::MAX_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; mem_map_sizes[utils::underlying_type(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[utils::underlying_type(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(uint32_t); + mem_map_sizes[utils::underlying_type(HalL1MemAddrType::FW_VERSION_ADDR)] = sizeof(std::uint32_t); std::vector> processor_classes(NumEthDispatchClasses); std::vector processor_types{0}; diff --git a/tt_metal/third_party/umd b/tt_metal/third_party/umd index 718132f4503..baed1c889ff 160000 --- a/tt_metal/third_party/umd +++ b/tt_metal/third_party/umd @@ -1 +1 @@ -Subproject commit 718132f450313cc08073f32494adcffa94fc1725 +Subproject commit baed1c889ff63e4d84b23d6d8cb3ad24a2390384 diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index fd207c920a0..1124375431c 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -199,6 +199,14 @@ set(ALL_TTNN_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_device_operation.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b_pybind.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_program_factory.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_program_factory.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_device_operation.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/create_qkv_heads.cpp diff --git a/ttnn/cpp/ttnn/operation.hpp b/ttnn/cpp/ttnn/operation.hpp index e20faf18fdf..267571da734 100644 --- a/ttnn/cpp/ttnn/operation.hpp +++ b/ttnn/cpp/ttnn/operation.hpp @@ -287,6 +287,22 @@ constexpr bool implements_validate_with_output_tensors_and_optional_input_tensor const OptionalTensors&>; // optional output_tensors } +template +using has_compute_output_shapes_t = 
decltype(std::declval().compute_output_shapes(std::declval()...)); + +template +constexpr bool implements_compute_output_shapes() { + return std::experimental::is_detected_v; +} + +template +using has_compute_output_specs_t = decltype(std::declval().compute_output_specs(std::declval()...)); + +template +constexpr bool implements_compute_output_specs() { + return std::experimental::is_detected_v; +} + template using has_create_output_tensors_t = decltype(std::declval().create_output_tensors(std::declval()...)); @@ -378,13 +394,41 @@ constexpr bool implements_get_parallelization_strategy() { return std::experimental::is_detected_v; } +template +auto default_create_output_tensors( + const ConcreteOperation& operation, + const Tensors& input_tensors, + const OptionalTensors& optional_output_tensors) -> ProgramOutputTensors { + using OutputTensors = ProgramOutputTensors; + OutputTensors output_tensors; + + if (!optional_output_tensors.empty() and optional_output_tensors[0].has_value()) { + output_tensors.reserve(optional_output_tensors.size()); + for (const auto& optional_output_tensor : optional_output_tensors) { + TT_FATAL(optional_output_tensor.has_value(), "If using optional output tensors, all output tensors must have a value"); + output_tensors.emplace_back(optional_output_tensor.value()); + } + return output_tensors; + } + const auto& device = input_tensors.at(0).device(); + const auto& output_specs = operation.compute_output_specs(input_tensors); + output_tensors.reserve(output_specs.size()); + for (const auto& [output_shape, output_layout] : output_specs) { + output_tensors.emplace_back(create_device_tensor( + output_shape, + output_layout, + device)); + } + return output_tensors; +} + } // namespace detail template struct DeviceOperation final { using storage_t = std::array; using OutputTensors = OutputTensorsT; - using ComputedShapes = std::variant, std::vector>; + using ComputedShapes = std::variant, std::vector, std::vector>; inline const std::string get_type_name() const { return this->get_type_name_impl_(this->type_erased_storage); } @@ -396,6 +440,7 @@ struct DeviceOperation final { this->type_erased_storage, input_tensors, optional_input_tensors, optional_output_tensors); } + // TODO: Rename into compute_output_specs in later PR inline const ComputedShapes compute_output_shapes(const Tensors& input_tensors) const { return this->compute_output_shapes_impl_(this->type_erased_storage, input_tensors); } @@ -502,14 +547,6 @@ struct DeviceOperation final { static_assert( tt::stl::concepts::always_false_v, "You cannot implement both validate and validate_with_output_tensors"); - } else if constexpr ( - (detail::implements_validate_with_output_tensors() or - detail::implements_validate_with_output_tensors_and_optional_input_tensors()) and - not detail::implements_create_output_tensors_with_optional_output_tensors()) { - static_assert( - tt::stl::concepts::always_false_v, - "Operation doesn't implement create_output_tensors with ant optional output tensors argument " - "when using validate_with_output_tensors"); } else if constexpr (detail::implements_validate() and not detail::implements_create_program()) { static_assert( tt::stl::concepts::always_false_v, @@ -547,7 +584,19 @@ struct DeviceOperation final { compute_output_shapes_impl_{ [](const storage_t& storage, const Tensors& input_tensors) -> const ComputedShapes { const auto& operation = *reinterpret_cast*>(&storage); - return operation.compute_output_shapes(input_tensors); + if constexpr 
(detail::implements_compute_output_shapes() and detail::implements_compute_output_specs()) { + static_assert( + tt::stl::concepts::always_false_v, + "Operation cannot implement both compute_output_shapes and compute_output_specs"); + } else if constexpr (detail::implements_compute_output_shapes()) { + return operation.compute_output_shapes(input_tensors); + } else if constexpr (detail::implements_compute_output_specs()) { + return operation.compute_output_specs(input_tensors); + } else { + static_assert( + tt::stl::concepts::always_false_v, + "Operation must implement either compute_output_shapes or compute_output_specs"); + } }}, create_output_tensors_impl_{ [](const storage_t& storage, @@ -555,9 +604,21 @@ struct DeviceOperation final { const OptionalTensors& output_tensors) -> const OutputTensors { const auto& operation = *reinterpret_cast*>(&storage); if constexpr (detail::implements_create_output_tensors_with_optional_output_tensors()) { + static_assert( + detail::implements_compute_output_shapes(), + "Operation must implement compute_output_shapes if it implements create_output_tensors"); return operation.create_output_tensors(input_tensors, output_tensors); - } else { + } else if constexpr (detail::implements_create_output_tensors()) { + static_assert( + detail::implements_compute_output_shapes(), + "Operation must implement compute_output_shapes if it implements create_output_tensors"); return operation.create_output_tensors(input_tensors); + } else if constexpr (detail::implements_compute_output_specs()) { + return detail::default_create_output_tensors(operation, input_tensors, output_tensors); + } else { + static_assert( + tt::stl::concepts::always_false_v, + "Operation must implement either create_output_tensors or compute_output_specs"); } }}, create_program_impl_{ diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.cpp index 386a76fef93..8ded07491f0 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.cpp @@ -82,11 +82,11 @@ void SliceDeviceOperation::validate_with_output_tensors( TT_FATAL(this->slice_start[i] <= this->slice_end[i], "Error"); } if(!output_tensors.empty() && output_tensors[0].has_value()){ - const auto output_shape_required = this->compute_output_shapes(input_tensors)[0]; + const auto output_shape_required = std::get<0>(this->compute_output_specs(input_tensors)[0]); const auto& out_tensor = output_tensors[0].value(); - TT_FATAL(out_tensor.get_legacy_shape() == output_shape_required, "The input tensors need a shape of {}, however the output tensor is only {}", output_shape_required, out_tensor.get_legacy_shape()); + TT_FATAL(out_tensor.get_padded_shape() == output_shape_required, "The input tensors need a shape of {}, however the output tensor is only {}", output_shape_required, out_tensor.get_padded_shape()); } - auto output_tensor_shape = this->compute_output_shapes(input_tensors)[0]; + auto output_tensor_shape = std::get<0>(this->compute_output_specs(input_tensors)[0]); if (has_step) { // if all ones modify before passing in to function TT_FATAL(input_tensor_a.get_layout() == Layout::ROW_MAJOR, "Strided slice is only supported for row major layout"); TT_FATAL(!input_tensor_a.is_sharded(), "Strided slice is not supported for sharded tensor"); @@ -117,40 +117,18 @@ void SliceDeviceOperation::validate_with_output_tensors( } } -std::vector 
SliceDeviceOperation::compute_output_shapes(const std::vector &input_tensors) const { - SmallVector out_shape; - auto rank = input_tensors[0].get_legacy_shape().rank(); - out_shape.reserve(rank); +std::vector SliceDeviceOperation::compute_output_specs(const std::vector &input_tensors) const { + const auto& input_tensor = input_tensors[0]; + SmallVector out_shape(input_tensor.get_logical_shape().rank()); auto output_dim_i = [this] (size_t i) { return (this->slice_end[i] - this->slice_start[i] + this->step[i] - 1) / this->step[i]; }; - for (uint32_t i = 0; i < rank; i++) { - out_shape.push_back(output_dim_i(i)); - } - tt::tt_metal::LegacyShape output_tensor_shape(out_shape); - return {output_tensor_shape}; -} - -std::vector SliceDeviceOperation::create_output_tensors( - const std::vector &input_tensors, const std::vector> &output_tensors) const { - if (!output_tensors.empty() && output_tensors[0].has_value()) { - return {output_tensors[0].value()}; - } - const auto &input_tensor_a = input_tensors.at(0); - const auto shapes = compute_output_shapes(input_tensors); - - if (input_tensor_a.is_sharded()) { - return {create_device_tensor( - shapes[0], - input_tensor_a.get_dtype(), - input_tensor_a.get_layout(), - input_tensor_a.device(), - this->output_mem_config)}; - } else { - return operation::generic_create_output_tensors( - *this, input_tensors, input_tensor_a.get_dtype(), input_tensor_a.get_layout(), this->output_mem_config); + for (uint32_t i = 0; i < out_shape.size(); i++) { + out_shape[i] = output_dim_i(i); } + ttnn::SimpleShape output_tensor_shape(std::move(out_shape)); + return {ttnn::TensorSpec(output_tensor_shape, tt::tt_metal::TensorLayout(input_tensor.get_dtype(), PageConfig(input_tensor.get_layout()), this->output_mem_config))}; } operation::ProgramWithCallbacks SliceDeviceOperation::create_program( diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.hpp b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.hpp index 5fdd6922cc9..a663db54a45 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.hpp @@ -22,8 +22,7 @@ struct SliceDeviceOperation { void validate_with_output_tensors(const std::vector& input_tensors, const std::vector>& output_tensors) const; - std::vector compute_output_shapes(const std::vector& input_tensors) const; - std::vector create_output_tensors(const std::vector& input_tensors, const std::vector>& output_tensors) const; + std::vector compute_output_specs(const std::vector& input_tensors) const; operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.cpp index 2c420128583..6dad529d179 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.cpp @@ -77,72 +77,57 @@ void Transpose::validate(const std::vector &input_tensors) const { } -std::vector Transpose::compute_output_shapes(const std::vector &input_tensors) const { +std::vector Transpose::compute_output_specs(const std::vector &input_tensors) const { const auto& input_tensor = input_tensors.at(0); - auto out_shape = input_tensor.get_legacy_shape(); - auto padding = out_shape.padding(); + + // TODO: Remove usage of input/output padded shape + // - Get output alignment from 
input alignment and output dtype, layout, mem_config + // - Get shard spec from output strides (logical shape + alignment)? + auto output_shape = input_tensor.get_logical_shape(); + auto output_padded_shape = input_tensor.get_padded_shape(); + switch (this->dim){ case TransposeOpDim::CN: - std::swap(out_shape[0], out_shape[1]); - std::swap(padding[0], padding[1]); + std::swap(output_shape[0], output_shape[1]); + std::swap(output_padded_shape[0], output_padded_shape[1]); break; case TransposeOpDim::HC: - std::swap(out_shape[1], out_shape[2]); - std::swap(padding[1], padding[2]); + std::swap(output_shape[1], output_shape[2]); + std::swap(output_padded_shape[1], output_padded_shape[2]); break; case TransposeOpDim::WH: - std::swap(out_shape[2], out_shape[3]); - std::swap(padding[2], padding[3]); + std::swap(output_shape[2], output_shape[3]); + std::swap(output_padded_shape[2], output_padded_shape[3]); break; case TransposeOpDim::NH: - std::swap(out_shape[0], out_shape[2]); - std::swap(padding[0], padding[2]); + std::swap(output_shape[0], output_shape[2]); + std::swap(output_padded_shape[0], output_padded_shape[2]); break; case TransposeOpDim::NW: - std::swap(out_shape[0], out_shape[3]); - std::swap(padding[0], padding[3]); + std::swap(output_shape[0], output_shape[3]); + std::swap(output_padded_shape[0], output_padded_shape[3]); break; case TransposeOpDim::CW: - std::swap(out_shape[1], out_shape[3]); - std::swap(padding[1], padding[3]); + std::swap(output_shape[1], output_shape[3]); + std::swap(output_padded_shape[1], output_padded_shape[3]); break; } - return {tt::tt_metal::LegacyShape(out_shape, padding)}; -} - -std::vector Transpose::create_output_tensors(const std::vector &input_tensors) const { - const auto& input_tensor = input_tensors.at(0); - // This is only for WH + auto output_mem_config = this->output_mem_config; if (this->output_mem_config.is_sharded()) { if (this->dim == TransposeOpDim::WH) { + const auto& input_padded_shape = input_tensor.get_padded_shape(); ShardSpec shard_spec = input_tensor.shard_spec().value(); - shard_spec.shape[0] = shard_spec.shape[0] / input_tensor.get_legacy_shape()[-2] * input_tensor.get_legacy_shape()[-1]; - shard_spec.shape[1] = input_tensor.get_legacy_shape()[-2]; - const auto output_shape = this->compute_output_shapes(input_tensors)[0]; - auto mem_config = this->output_mem_config; - mem_config.shard_spec = shard_spec; - return {create_device_tensor( - output_shape, - input_tensor.get_dtype(), - input_tensor.get_layout(), - input_tensor.device(), - mem_config)}; + shard_spec.shape[0] = shard_spec.shape[0] / input_padded_shape[-2] * input_padded_shape[-1]; + shard_spec.shape[1] = input_padded_shape[-2]; + output_mem_config.shard_spec = shard_spec; } else if (this->dim == TransposeOpDim::HC) { - const auto output_shape = this->compute_output_shapes(input_tensors)[0]; - auto mem_config = this->output_mem_config; - mem_config.shard_spec = input_tensor.shard_spec().value(); - return {create_device_tensor( - output_shape, - input_tensor.get_dtype(), - input_tensor.get_layout(), - input_tensor.device(), - mem_config)}; + output_mem_config.shard_spec = input_tensor.shard_spec().value(); } else { TT_ASSERT(false, "Unsupported sharding"); } } - return operation::generic_create_output_tensors(*this, input_tensors, input_tensor.get_dtype(), input_tensor.get_layout(), this->output_mem_config); + return {ttnn::TensorSpec(output_shape, TensorLayout::fromLegacyPaddedShape(input_tensor.get_dtype(), PageConfig(input_tensor.get_layout()), output_mem_config, 
ttnn::Shape(output_shape.view(), output_padded_shape.view())))}; } operation::ProgramWithCallbacks Transpose::create_program(const std::vector& input_tensors, std::vector &output_tensors) const { diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.hpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.hpp index 776509a6c80..44979d114e0 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.hpp @@ -23,8 +23,7 @@ struct Transpose { const MemoryConfig output_mem_config; void validate(const std::vector &input_tensors) const; - std::vector compute_output_shapes(const std::vector &input_tensors) const; - std::vector create_output_tensors(const std::vector &input_tensors) const; + std::vector compute_output_specs(const std::vector &input_tensors) const; operation::ProgramWithCallbacks create_program(const std::vector& input_tensors, std::vector &output_tensors) const; TransposeOpParallelizationStrategy get_parallelization_strategy(const std::vector &input_tensors) const; diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp index 9626a678f6a..5033425fd38 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp @@ -81,7 +81,7 @@ void bind_primitive_binary_operation(py::module& module, const binary_operation_ template -void bind_binary_operation(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& info=". ", const std::string& note=" ") { +void bind_binary_operation(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& info=". ", const std::string& supported_dtype="BFLOAT16", const std::string& note=" ") { auto doc = fmt::format( R"doc( {2} @@ -106,7 +106,19 @@ void bind_binary_operation(py::module& module, const binary_operation_t& operati Supports broadcasting. Note: - {5} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {5} + - TILE + - 2, 3, 4 + + {6} Example: @@ -119,6 +131,7 @@ void bind_binary_operation(py::module& module, const binary_operation_t& operati description, math, info, + supported_dtype, note); bind_registered_operation( @@ -173,7 +186,7 @@ void bind_binary_operation(py::module& module, const binary_operation_t& operati } template -void bind_binary_composite(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& note="") { +void bind_binary_composite(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& supported_dtype = "BFLAOT16", const std::string& note="") { auto doc = fmt::format( R"doc( {2} @@ -192,7 +205,19 @@ void bind_binary_composite(py::module& module, const binary_operation_t& operati ttnn.Tensor: the output tensor. Note: - {4} + Supported dtypes, layouts, and ranks: + + .. 
list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {4} + - TILE + - 2, 3, 4 + + {5} Example: >>> tensor1 = ttnn.to_device(ttnn.from_torch(torch.tensor(([[1, 2], [3, 4]]), dtype=torch.bfloat16)), device=device) @@ -205,6 +230,7 @@ void bind_binary_composite(py::module& module, const binary_operation_t& operati operation.python_fully_qualified_name(), description, math, + supported_dtype, note); bind_registered_operation( @@ -471,7 +497,7 @@ void bind_div(py::module& module, const binary_operation_t& operation, const std } template -void bind_polyval(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& note=" ") { +void bind_polyval(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& supported_dtype="BFLOAT16", const std::string& note=" ") { auto doc = fmt::format( R"doc( {2} @@ -490,7 +516,19 @@ void bind_polyval(py::module& module, const binary_operation_t& operation, const ttnn.Tensor: the output tensor. Note: - {4} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {4} + - TILE + - 2, 3, 4 + + {5} Example: >>> tensor = ttnn.to_device(ttnn.from_torch(torch.tensor(([[1, 2], [3, 4]]), dtype=torch.bfloat16)), device=device) @@ -503,6 +541,7 @@ void bind_polyval(py::module& module, const binary_operation_t& operation, const operation.python_fully_qualified_name(), description, math, + supported_dtype, note); bind_registered_operation( @@ -577,7 +616,7 @@ void bind_binary_overload_operation(py::module& module, const binary_operation_t } template -void bind_inplace_operation(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math) { +void bind_inplace_operation(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& supported_dtype="BFLOAT16", const std::string& note="") { auto doc = fmt::format( R"doc( {2} @@ -592,6 +631,21 @@ void bind_inplace_operation(py::module& module, const binary_operation_t& operat Returns: ttnn.Tensor: the output tensor. + Note: + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {4} + - TILE + - 2, 3, 4 + + {5} + Example: >>> tensor = ttnn.from_torch(torch.tensor(([[1, 2], [3, 4]]), dtype=torch.bfloat16), device=device) >>> output = {1}(tensor1, tensor2) @@ -599,7 +653,9 @@ void bind_inplace_operation(py::module& module, const binary_operation_t& operat operation.base_name(), operation.python_fully_qualified_name(), description, - math); + math, + supported_dtype, + note); bind_registered_operation( module, @@ -626,7 +682,7 @@ void bind_inplace_operation(py::module& module, const binary_operation_t& operat } template -void bind_logical_inplace_operation(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& note=" ") { +void bind_logical_inplace_operation(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& supported_dtype = "BFLOAT16", const std::string& note=" ") { auto doc = fmt::format( R"doc( {2} @@ -642,7 +698,19 @@ void bind_logical_inplace_operation(py::module& module, const binary_operation_t ttnn.Tensor: the output tensor. 
Note: - {4} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {4} + - TILE + - 2, 3, 4 + + {5} Example: >>> tensor1 = ttnn.from_torch(torch.tensor(([[1, 2], [3, 4]]), dtype=torch.bfloat16), device=device) @@ -653,6 +721,7 @@ void bind_logical_inplace_operation(py::module& module, const binary_operation_t operation.python_fully_qualified_name(), description, math, + supported_dtype, note); bind_registered_operation( @@ -731,7 +800,7 @@ void py_module(py::module& module) { ttnn::add, R"doc(Adds :attr:`input_tensor_a` to :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", R"doc(\mathrm{{output\_tensor}}_i = (\mathrm{{input\_tensor\_a}}_i + \mathrm{{input\_tensor\_b}}_i))doc", - R"doc(: :code:`'None'` | :code:`'relu'`. )doc"); + R"doc(: :code:`'None'` | :code:`'relu'`. )doc", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_inplace_operation( module, @@ -745,14 +814,7 @@ void py_module(py::module& module) { R"doc(Subtracts :attr:`input_tensor_b` from :attr:`input_tensor_a` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", R"doc(\mathrm{{output\_tensor}}_i = (\mathrm{{input\_tensor\_a}}_i - \mathrm{{input\_tensor\_b}}_i))doc", R"doc(: :code:`'None'` | :code:`'relu'`. )doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_inplace_operation( module, @@ -765,15 +827,7 @@ void py_module(py::module& module) { ttnn::multiply, R"doc(Multiplies :attr:`input_tensor_a` by :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", R"doc(\mathrm{{output\_tensor}}_i = (\mathrm{{input\_tensor\_a}}_i * \mathrm{{input\_tensor\_b}}_i))doc", - R"doc(: :code:`'None'` | :code:`'relu'`. )doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc(: :code:`'None'` | :code:`'relu'`. )doc", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_inplace_operation( module, @@ -821,91 +875,37 @@ void py_module(py::module& module) { module, ttnn::logical_and, R"doc(Compute logical AND of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", - R"doc(\mathrm{{output\_tensor}}_i = (\mathrm{{input\_tensor\_a}}_i \& \mathrm{{input\_tensor\_b}}_i))doc",". 
", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc(\mathrm{{output\_tensor}}_i = (\mathrm{{input\_tensor\_a}}_i \& \mathrm{{input\_tensor\_b}}_i))doc",". ", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_operation( module, ttnn::logical_or, R"doc(Compute logical OR of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", - R"doc(\mathrm{{output\_tensor}}_i = (\mathrm{{input\_tensor\_a}}_i | \mathrm{{input\_tensor\_b}}_i))doc",". ", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc(\mathrm{{output\_tensor}}_i = (\mathrm{{input\_tensor\_a}}_i | \mathrm{{input\_tensor\_b}}_i))doc",". ", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_operation( module, ttnn::ldexp, R"doc(Compute ldexp of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", - R"doc(\mathrm{{output\_tensor}} = ldexp(\mathrm{{input\_tensor\_a,input\_tensor\_b}}))doc",". ", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc(\mathrm{{output\_tensor}} = ldexp(\mathrm{{input\_tensor\_a,input\_tensor\_b}}))doc",". ", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_operation( module, ttnn::logaddexp, R"doc(Compute logaddexp of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", - R"doc(\mathrm{{output\_tensor}} = logaddexp(\mathrm{{input\_tensor\_a,input\_tensor\_b}}))doc",". ", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc(\mathrm{{output\_tensor}} = logaddexp(\mathrm{{input\_tensor\_a,input\_tensor\_b}}))doc",". ", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_operation( module, ttnn::logaddexp2, R"doc(Compute logaddexp2 of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", - R"doc(\mathrm{{output\_tensor}} = logaddexp2(\mathrm{{input\_tensor\_a,input\_tensor\_b}}))doc",". 
", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc(\mathrm{{output\_tensor}} = logaddexp2(\mathrm{{input\_tensor\_a,input\_tensor\_b}}))doc",". ", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_operation( module, ttnn::squared_difference, R"doc(Compute squared difference of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", - R"doc(\mathrm{{output\_tensor}} = \verb|squared_difference|(\mathrm{{input\_tensor\_a,input\_tensor\_b}}))doc", ". ", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc(\mathrm{{output\_tensor}} = \verb|squared_difference|(\mathrm{{input\_tensor\_a,input\_tensor\_b}}))doc", ". ", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_operation( module, @@ -939,14 +939,6 @@ void py_module(py::module& module) { ttnn::xlogy, R"doc(Compute xlogy :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", R"doc(\mathrm{output\_tensor}_i = \mathrm{input\_tensor\_a}_i \cdot \log(\mathrm{input\_tensor\_b}_i) - )doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16 | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ )doc"); detail::bind_binary_composite( @@ -973,59 +965,25 @@ void py_module(py::module& module) { ttnn::logical_xor, R"doc(Compute logical_xor :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", R"doc(\mathrm{output\_tensor}_i = (\mathrm{input\_tensor\_a}_i \land \lnot \mathrm{input\_tensor\_b}_i) \lor (\lnot \mathrm{input\_tensor\_a}_i \land \mathrm{input\_tensor\_b}_i))doc",".", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_logical_inplace_operation( module, ttnn::logical_or_, R"doc(Compute inplace logical OR of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", - R"doc((\mathrm{{input\_tensor\_a}}_i | \mathrm{{input\_tensor\_b}}_i))doc", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - 
+----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc((\mathrm{{input\_tensor\_a}}_i | \mathrm{{input\_tensor\_b}}_i))doc",R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_logical_inplace_operation( module, ttnn::logical_xor_, R"doc(Compute inplace logical XOR of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", - R"doc((\mathrm{input\_tensor\_a}_i \land \lnot \mathrm{input\_tensor\_b}_i) \lor (\lnot \mathrm{input\_tensor\_a}_i \land \mathrm{input\_tensor\_b}_i))doc", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc((\mathrm{input\_tensor\_a}_i \land \lnot \mathrm{input\_tensor\_b}_i) \lor (\lnot \mathrm{input\_tensor\_a}_i \land \mathrm{input\_tensor\_b}_i))doc", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_logical_inplace_operation( module, ttnn::logical_and_, R"doc(Compute inplace logical AND of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", - R"doc((\mathrm{{input\_tensor\_a}}_i \& \mathrm{{input\_tensor\_b}}_i))doc", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc((\mathrm{{input\_tensor\_a}}_i \& \mathrm{{input\_tensor\_b}}_i))doc", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_composite( module, @@ -1105,16 +1063,7 @@ void py_module(py::module& module) { ttnn::polyval, R"doc(Compute polyval of all elements of :attr:`input_tensor_a` with coefficient :attr:`coeffs` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", R"doc(\mathrm{output\_tensor} = \sum_{i=0}^{n} (\mathrm{coeffs}_i) (\mathrm{input\_tensor}^i) - )doc", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + )doc",R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_overload_operation( module, diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward_pybind.hpp index 22e09bdaac3..b99897762e7 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward_pybind.hpp @@ -22,7 +22,7 @@ namespace binary_backward { namespace detail { template -void bind_binary_backward_ops(py::module& module, const binary_backward_operation_t& operation, const 
std::string_view description, const std::string_view supported_dtype = "") { +void bind_binary_backward_ops(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( {2} @@ -39,10 +39,19 @@ void bind_binary_backward_ops(py::module& module, const binary_backward_operatio List of ttnn.Tensor: the output tensor. Note: - {3} + Supported dtypes, layouts, and ranks: - Note : bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT + .. list-table:: + :header-rows: 1 + * - Dtypes + - Layouts + - Ranks + * - {3} + - TILE + - 2, 3, 4 + + bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT Example: >>> grad_tensor = ttnn.to_device(ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16)), device=device) @@ -79,7 +88,7 @@ void bind_binary_backward_ops(py::module& module, const binary_backward_operatio } template -void bind_binary_backward_int_default(py::module& module, const binary_backward_operation_t& operation, const std::string& parameter_name, const std::string& parameter_doc, int parameter_value, const std::string_view description, const std::string_view supported_dtype = "") { +void bind_binary_backward_int_default(py::module& module, const binary_backward_operation_t& operation, const std::string& parameter_name, const std::string& parameter_doc, int parameter_value, const std::string_view description, const std::string_view supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( {5} @@ -104,9 +113,19 @@ void bind_binary_backward_int_default(py::module& module, const binary_backward_ Note: - {6} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 - Note : bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT + * - Dtypes + - Layouts + - Ranks + * - {6} + - TILE + - 2, 3, 4 + + bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT Example: @@ -157,7 +176,7 @@ void bind_binary_backward_int_default(py::module& module, const binary_backward_ } template -void bind_binary_backward_opt_float_default(py::module& module, const binary_backward_operation_t& operation, const std::string& parameter_name, const std::string& parameter_doc, float parameter_value, const std::string_view description, const std::string_view supported_dtype = "") { +void bind_binary_backward_opt_float_default(py::module& module, const binary_backward_operation_t& operation, const std::string& parameter_name, const std::string& parameter_doc, float parameter_value, const std::string_view description, const std::string_view supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( {5} @@ -183,9 +202,19 @@ void bind_binary_backward_opt_float_default(py::module& module, const binary_bac Note: - {6} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 - Note : bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT + * - Dtypes + - Layouts + - Ranks + * - {6} + - TILE + - 2, 3, 4 + + bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT Example: @@ -246,7 +275,7 @@ void bind_binary_backward_float_string_default( const std::string& parameter_b_doc, string parameter_b_value, const std::string& description, - const std::string& supported_dtype) { + const std::string& supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( @@ -265,9 +294,19 @@ void bind_binary_backward_float_string_default( List of ttnn.Tensor: the output tensor. Note: - {8} + Supported dtypes, layouts, and ranks: + + .. 
list-table:: + :header-rows: 1 - Note : bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT + * - Dtypes + - Layouts + - Ranks + * - {8} + - TILE + - 2, 3, 4 + + bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT Example: @@ -323,7 +362,7 @@ void bind_binary_backward_float_string_default( } template -void bind_binary_backward_sub_alpha(py::module& module, const binary_backward_operation_t& operation, const std::string& parameter_name, const std::string& parameter_doc, float parameter_value, const std::string_view description, const std::string_view supported_dtype = "") { +void bind_binary_backward_sub_alpha(py::module& module, const binary_backward_operation_t& operation, const std::string& parameter_name, const std::string& parameter_doc, float parameter_value, const std::string_view description, const std::string_view supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( @@ -343,9 +382,19 @@ void bind_binary_backward_sub_alpha(py::module& module, const binary_backward_op queue_id (int, optional): command queue id. Defaults to `0`. Note: - {6} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 - Note : bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT + * - Dtypes + - Layouts + - Ranks + * - {6} + - TILE + - 2, 3, 4 + + bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT Example: @@ -394,7 +443,7 @@ void bind_binary_backward_sub_alpha(py::module& module, const binary_backward_op } template -void bind_binary_backward_rsub(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "") { +void bind_binary_backward_rsub(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( @@ -414,9 +463,19 @@ void bind_binary_backward_rsub(py::module& module, const binary_backward_operati queue_id (int, optional): command queue id. Defaults to `0`. Note: - {3} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 - Note : bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT + * - Dtypes + - Layouts + - Ranks + * - {3} + - TILE + - 2, 3, 4 + + bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT Example: @@ -460,7 +519,7 @@ void bind_binary_backward_rsub(py::module& module, const binary_backward_operati } template -void bind_binary_bw_mul(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "") { +void bind_binary_bw_mul(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( {2} @@ -481,9 +540,19 @@ void bind_binary_bw_mul(py::module& module, const binary_backward_operation_t& o List of ttnn.Tensor: the output tensor. Note: - {3} + Supported dtypes, layouts, and ranks: + + .. 
list-table:: + :header-rows: 1 - Note : bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT + * - Dtypes + - Layouts + - Ranks + * - {3} + - TILE + - 2, 3, 4 + + bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT Example: >>> grad_tensor = ttnn.to_device(ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16)), device) @@ -561,7 +630,7 @@ void bind_binary_bw_mul(py::module& module, const binary_backward_operation_t& o template -void bind_binary_bw(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "") { +void bind_binary_bw(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( @@ -581,9 +650,19 @@ void bind_binary_bw(py::module& module, const binary_backward_operation_t& opera queue_id (int, optional): command queue id. Defaults to `0`. Note: - {3} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 - Note : bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT + * - Dtypes + - Layouts + - Ranks + * - {3} + - TILE + - 2, 3, 4 + + bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT Example: >>> grad_tensor = ttnn.to_device(ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16)), device) @@ -662,7 +741,7 @@ void bind_binary_bw(py::module& module, const binary_backward_operation_t& opera } template -void bind_binary_bw_div(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "") { +void bind_binary_bw_div(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( @@ -687,9 +766,19 @@ void bind_binary_bw_div(py::module& module, const binary_backward_operation_t& o Supports broadcasting. Note: - {3} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 - Note : bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT + * - Dtypes + - Layouts + - Ranks + * - {3} + - TILE + - 2, 3, 4 + + bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT Example: >>> grad_tensor = ttnn.to_device(ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16)), device) @@ -770,7 +859,7 @@ void bind_binary_bw_div(py::module& module, const binary_backward_operation_t& o } template -void bind_binary_backward_overload(py::module& module, const binary_backward_operation_t& operation, const std::string& description, const std::string& supported_dtype) { +void bind_binary_backward_overload(py::module& module, const binary_backward_operation_t& operation, const std::string& description, const std::string& supported_dtype = "BFLOAT16", const std::string& note = "") { auto doc = fmt::format( R"doc( @@ -788,7 +877,19 @@ void bind_binary_backward_overload(py::module& module, const binary_backward_ope List of ttnn.Tensor: the output tensor. Note: - {3} + Supported dtypes, layouts, and ranks: + + .. 
list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {3} + - TILE + - 2, 3, 4 + + {4} Example: @@ -800,7 +901,7 @@ void bind_binary_backward_overload(py::module& module, const binary_backward_ope operation.base_name(), operation.python_fully_qualified_name(), description, - supported_dtype); + supported_dtype, note); bind_registered_operation( module, @@ -838,7 +939,7 @@ void bind_binary_backward_overload(py::module& module, const binary_backward_ope } template -void bind_binary_backward_assign(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "") { +void bind_binary_backward_assign(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( @@ -858,7 +959,17 @@ void bind_binary_backward_assign(py::module& module, const binary_backward_opera round_mode (str, optional): Round mode for the operation. Defaults to `None`. Note: - {3} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {3} + - TILE + - 2, 3, 4 Example: >>> grad_tensor = ttnn.to_device(ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16)), device) @@ -924,288 +1035,126 @@ void py_module(py::module& module) { module, ttnn::mul_bw, R"doc(Performs backward operations for multiply on :attr:`input_tensor_a`, :attr:`input_tensor_b`, with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_bw( module, ttnn::add_bw, R"doc(Performs backward operations for add of :attr:`input_tensor_a` and :attr:`input_tensor_b` or :attr:`scalar` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_bw( module, ttnn::sub_bw, R"doc(Performs backward operations for subtract of :attr:`input_tensor_a` and :attr:`input_tensor_b` or :attr:`scalar` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_bw_div( module, ttnn::div_bw, R"doc(Performs backward operations for divide on :attr:`input_tensor`, :attr:`alpha` or :attr:`input_tensor_a`, :attr:`input_tensor_b`, :attr:`round_mode`, with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - 
+----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_backward_overload( module, ttnn::remainder_bw, R"doc(Performs backward operations for remainder of :attr:`input_tensor_a`, :attr:`scalar` or :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16 | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - supported only in WHB0. - - )doc"); + R"doc(BFLOAT16)doc", R"doc(Supported only in WHB0.)doc"); detail::bind_binary_backward_overload( module, ttnn::fmod_bw, R"doc(Performs backward operations for fmod of :attr:`input_tensor_a`, :attr:`scalar` or :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16 | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16)doc"); detail::bind_binary_backward_assign( module, ttnn::assign_bw, - R"doc(Performs backward operations for assign of :attr:`input_tensor_a`, :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | ROW_MAJOR, TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(Performs backward operations for assign of :attr:`input_tensor_a`, :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_backward_ops( module, ttnn::atan2_bw, R"doc(Performs backward operations for atan2 of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_backward_sub_alpha( module, ttnn::subalpha_bw, "alpha", "Alpha value", 1.0f, R"doc(Performs backward operations for subalpha of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - 
+----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_backward_opt_float_default( module, ttnn::addalpha_bw, "alpha", "Alpha value", 1.0f, R"doc(Performs backward operations for addalpha on :attr:`input_tensor_b` , :attr:`input_tensor_a` and :attr:`alpha` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_backward_ops( module, ttnn::xlogy_bw, R"doc(Performs backward operations for xlogy of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: + R"doc(BFLOAT16, BFLOAT8_B)doc"); - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16 | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); detail::bind_binary_backward_ops( module, ttnn::hypot_bw, R"doc(Performs backward operations for hypot of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ + R"doc(BFLOAT16, BFLOAT8_B)doc"); - )doc"); detail::bind_binary_backward_ops( module, ttnn::ldexp_bw, R"doc(Performs backward operations for ldexp of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: + R"doc(BFLOAT16)doc"); - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); detail::bind_binary_backward_ops( module, ttnn::logaddexp_bw, R"doc(Performs backward operations for logaddexp of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ + R"doc(BFLOAT16)doc"); - )doc"); detail::bind_binary_backward_ops( module, ttnn::logaddexp2_bw, R"doc(Performs backward operations for logaddexp2 of :attr:`input_tensor_a` 
and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16)doc"); detail::bind_binary_backward_ops( module, ttnn::squared_difference_bw, R"doc(Performs backward operations for squared_difference of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: + R"doc(BFLOAT16, BFLOAT8_B)doc"); - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); detail::bind_binary_backward_int_default( module, ttnn::concat_bw, "dim", "Dimension to concatenate", 0, R"doc(Performs backward operations for concat on :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16 | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16)doc"); detail::bind_binary_backward_rsub( module, ttnn::rsub_bw, R"doc(Performs backward operations for subraction of :attr:`input_tensor_a` from :attr:`input_tensor_b` with given :attr:`grad_tensor` (reversed order of subtraction operator).)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_backward_ops( module, ttnn::min_bw, - R"doc(Performs backward operations for minimum of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16 | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(Performs backward operations for minimum of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc"); detail::bind_binary_backward_ops( module, ttnn::max_bw, - R"doc(Performs backward operations for maximum of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - 
+----------------------------+---------------------------------+-------------------+ - | BFLOAT16 | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(Performs backward operations for maximum of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc"); detail::bind_binary_backward_float_string_default( module, @@ -1217,15 +1166,7 @@ void py_module(py::module& module) { "none", R"doc(Performs backward operations for bias_gelu on :attr:`input_tensor_a` and :attr:`input_tensor_b` or :attr:`input_tensor` and :attr:`bias`, with given :attr:`grad_tensor` using given :attr:`approximate` mode. :attr:`approximate` mode can be 'none', 'tanh'.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); } } // namespace binary_backward diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation.cpp index b8f7e3c1c10..255ca459504 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation.cpp @@ -149,7 +149,7 @@ void UnaryDeviceOperation::validate_on_program_cache_miss( shape_return_value_t UnaryDeviceOperation::compute_output_shapes( const operation_attributes_t&, const tensor_args_t& tensor_args) { - return {tensor_args.input.get_shape()}; + return {tensor_args.input.get_logical_shape()}; } tensor_return_value_t UnaryDeviceOperation::create_output_tensors( @@ -158,13 +158,12 @@ tensor_return_value_t UnaryDeviceOperation::create_output_tensors( return tensor_args.preallocated_output.value(); } - const auto output_shape = compute_output_shapes(args, tensor_args); - auto output_layout = Layout::TILE; if (args.output_memory_config.is_sharded()) { output_layout = tensor_args.input.get_layout(); } + const auto output_shape = tensor_args.input.shape(); return create_device_tensor( output_shape, args.output_dtype, output_layout, tensor_args.input.device(), args.output_memory_config); } diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation_types.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation_types.hpp index f600d7317f5..95d100a9c85 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation_types.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation_types.hpp @@ -27,6 +27,6 @@ struct tensor_args_t { using tensor_return_value_t = Tensor; -using shape_return_value_t = ttnn::Shape; +using shape_return_value_t = ttnn::SimpleShape; } // namespace ttnn::operations::unary diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp index 04011c73f1d..4a720efa199 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp @@ -897,7 +897,7 @@ void bind_identity(py::module& module, const unary_operation_t& operation) { } template -void bind_power(py::module& module, const unary_operation_t& operation, const 
std::string& info_doc = "") { +void bind_power(py::module& module, const unary_operation_t& operation, const std::string& supported_dtype="BFLOAT16", const std::string& info_doc = "") { auto doc = fmt::format( R"doc( Applies {0} to :attr:`input_tensor` element-wise. @@ -918,7 +918,19 @@ void bind_power(py::module& module, const unary_operation_t& operation, const st ttnn.Tensor: the output tensor. Note: - {2} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {2} + - TILE + - 2, 3, 4 + + {3} Example: >>> tensor = ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16), device=device) @@ -926,6 +938,7 @@ void bind_power(py::module& module, const unary_operation_t& operation, const st )doc", ttnn::pow.base_name(), ttnn::pow.python_fully_qualified_name(), + supported_dtype, info_doc); bind_registered_operation( @@ -1789,16 +1802,7 @@ void py_module(py::module& module) { detail::bind_sigmoid_accurate(module, ttnn::sigmoid_accurate); detail::bind_unary_chain(module, ttnn::unary_chain); detail::bind_identity(module, ttnn::identity); - detail::bind_power(module, ttnn::pow, - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + detail::bind_power(module, ttnn::pow, R"doc(BFLOAT16, BFLOAT8_B)doc"); // unary composite imported into ttnn detail::bind_unary_composite(module, ttnn::deg2rad, R"doc(Performs deg2rad function on :attr:`input_tensor`.)doc"); diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp index 4fd3da1951b..c3ed75dcb6f 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp @@ -163,7 +163,7 @@ std::vector all_gather_matmul( /* Matmul setup */ - bool user_run_batched = ttnn::operations::matmul::detail::is_input_batched(weight_tensor.get_shape()); + bool user_run_batched = ttnn::operations::matmul::detail::is_input_batched(weight_tensor.get_logical_shape()); std::optional user_core_coord; if (core_grid.has_value()) { user_core_coord = CoreCoord(core_grid->x, core_grid->y); diff --git a/ttnn/cpp/ttnn/operations/experimental/experimental_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/experimental_pybind.cpp index 19b097d0b21..8f5c68b23de 100644 --- a/ttnn/cpp/ttnn/operations/experimental/experimental_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/experimental_pybind.cpp @@ -20,6 +20,8 @@ #include "ttnn/operations/experimental/transformer/nlp_create_qkv_heads/nlp_create_qkv_heads_pybind.hpp" #include "ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/nlp_create_qkv_heads_decode_pybind.hpp" #include "ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b_pybind.hpp" +#include "ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.hpp" +#include "ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.hpp" #include 
"ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/nlp_kv_cache_load_slice_pybind.hpp" #include "ttnn/operations/experimental/paged_cache/paged_cache_pybind.hpp" #include "ttnn/operations/experimental/transformer/rotary_embedding/rotary_embedding_pybind.hpp" @@ -44,6 +46,8 @@ void py_module(py::module& module) { transformer::detail::bind_nlp_concat_heads_decode(module); transformer::detail::bind_nlp_create_qkv_heads_decode(module); transformer::detail::bind_nlp_create_qkv_heads_falcon7b(module); + transformer::detail::bind_nlp_create_qkv_heads_vit(module); + transformer::detail::bind_nlp_create_qkv_heads_segformer(module); transformer::detail::bind_nlp_kv_cache_load_slice(module); transformer::py_bind_rotary_embedding(module); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp new file mode 100644 index 00000000000..092c948382d --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp @@ -0,0 +1,50 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + + +void kernel_main() { + // READER RUNTIME ARGS + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in1_tensor_addr = get_arg_val(1); + uint32_t num_blocks = get_arg_val(2); + uint32_t in0_tensor_tile_id = get_arg_val(3); + uint32_t in1_tensor_tile_id = get_arg_val(4); + + // COMPILE TIME ARGS + // interleaved accessor args + constexpr uint32_t in0_is_dram = get_compile_time_arg_val(0); + constexpr uint32_t in1_is_dram = get_compile_time_arg_val(1); + // READER COMPILE TIME ARGS + constexpr uint32_t q_num_tiles = get_compile_time_arg_val(2); + + + constexpr uint32_t cb_id_qv = 1; // cb for Q, V heads + + constexpr uint32_t onetile = 1; + const uint32_t single_tile_size_bytes = get_tile_size(cb_id_qv); + const DataFormat data_format = get_dataformat(cb_id_qv); + + constexpr bool in0_is_dram_bool = in0_is_dram == 1; + const InterleavedAddrGenFast s0 = { + .bank_base_address = in0_tensor_addr, + .page_size = single_tile_size_bytes, + .data_format = data_format, + }; + + + for (uint32_t block = 0; block < num_blocks; block++) { + // Q + for (uint32_t i = 0; i < q_num_tiles; i++) { + cb_reserve_back(cb_id_qv, onetile); + uint32_t l1_write_addr = get_write_ptr(cb_id_qv); + noc_async_read_tile(in0_tensor_tile_id, s0, l1_write_addr); + noc_async_read_barrier(); + cb_push_back(cb_id_qv, onetile); + in0_tensor_tile_id++; + } + } +} diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp new file mode 100644 index 00000000000..e00a56b5c67 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp @@ -0,0 +1,75 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include "dataflow_api.h" + + +void kernel_main() { + // WRITER RUNTIME ARGS + uint32_t q_tensor_addr = get_arg_val(0); + uint32_t num_blocks = get_arg_val(1); + uint32_t q_out_h_dim = get_arg_val(2); + uint32_t q_out_tensor_tile_id = get_arg_val(3); + + // COMPILE TIME ARGS + // interleaved accessor args + constexpr uint32_t out_is_dram = get_compile_time_arg_val(0); + constexpr uint32_t q_out_h_tiles = get_compile_time_arg_val(1); + constexpr uint32_t q_out_w_tiles = get_compile_time_arg_val(2); + constexpr uint32_t q_out_HtWt = get_compile_time_arg_val(3); + constexpr uint32_t q_out_c = get_compile_time_arg_val(4); + + constexpr uint32_t cb_id_qv = 1; // cb for Q, V heads tiles + + const uint32_t single_tile_size_bytes = get_tile_size(cb_id_qv); + const DataFormat data_format = get_dataformat(cb_id_qv); + + constexpr bool out_is_dram_bool = out_is_dram == 1; + const InterleavedAddrGenFast sq = { + .bank_base_address = q_tensor_addr, + .page_size = single_tile_size_bytes, + .data_format = data_format + }; + + constexpr uint32_t block_size = 1; // micro-block size for read/write; nothing to do with num_blocks + // TODO: This might negatively impact perf + constexpr uint32_t out_num_tiles_read = block_size; // always read and pop by micro-block size for generality + uint32_t l1_read_addr; + uint32_t q_out_tensor_current_tile_id; // need this to update q_out_tensor_tile_id + uint32_t out_tensor_current_tile_id_along_c; + + for (uint32_t block = 0; block < num_blocks; block++) { + // q + create q head --> outputs: [B, num_q_heads, S, head_dim] + out_tensor_current_tile_id_along_c = q_out_tensor_tile_id; + for (uint32_t c_dim = 0; c_dim < q_out_c; c_dim++) { + q_out_tensor_current_tile_id = out_tensor_current_tile_id_along_c; + for (uint32_t w_dim = 0; w_dim < q_out_w_tiles; w_dim++) { + cb_wait_front(cb_id_qv, out_num_tiles_read); + l1_read_addr = get_read_ptr(cb_id_qv); + noc_async_write_tile(q_out_tensor_current_tile_id, sq, l1_read_addr); + + noc_async_write_barrier(); + cb_pop_front(cb_id_qv, out_num_tiles_read); + + q_out_tensor_current_tile_id++; + } + out_tensor_current_tile_id_along_c += q_out_HtWt; + } + + // Update out_tensor_tile_id for next h_dim or batch if we finish one CHtWt + q_out_h_dim++; + if (q_out_h_dim < q_out_h_tiles) { + q_out_tensor_tile_id += q_out_w_tiles; + } else { + // If we finish one batch, always roll over to next tile in memory + // This is just the current_tile_id, except for K when we transpose heads + // In this case, decrement k_out_tensor_current_tile_id by the stride (q_out_h_tiles) and add 1 to roll over + q_out_tensor_tile_id = q_out_tensor_current_tile_id; + q_out_h_dim = 0; + } + } + +} diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.cpp new file mode 100644 index 00000000000..c3dcffdc410 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.cpp @@ -0,0 +1,57 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "nlp_create_qkv_heads_segformer_device_operation.hpp" + +#include "tt_metal/common/work_split.hpp" + +namespace ttnn::operations::experimental::transformer { + +// Hard-coded for Segformer +void NlpCreateHeadsSegformerDeviceOperation::validate(const std::vector& input_tensors) const { + const auto& input_tensor = input_tensors.at(0); + const auto input_shape = input_tensor.get_legacy_shape(); + + TT_FATAL(input_tensor.storage_type() == StorageType::DEVICE, "Operands to TM need to be on device!"); + TT_FATAL(input_tensor.buffer() != nullptr, "Operands to TM need to be allocated in buffers on device!"); + TT_FATAL(input_tensor.get_dtype() == tt::tt_metal::DataType::FLOAT32 || input_tensor.get_dtype() == tt::tt_metal::DataType::BFLOAT16 || input_tensor.get_dtype() == tt::tt_metal::DataType::BFLOAT8_B, "Unsupported data format"); + TT_FATAL(input_tensor.get_layout() == Layout::TILE, "Error"); + + TT_FATAL(input_shape[2] % tt::constants::TILE_HEIGHT == 0, "Error"); + TT_FATAL(input_shape[3] % tt::constants::TILE_HEIGHT == 0, "Error"); + //TT_FATAL((input_shape == tt::tt_metal::LegacyShape({input_shape[0], 1, input_shape[2], 2304})), "Unsupported input shape"); + TT_FATAL(this->output_mem_config.memory_layout == TensorMemoryLayout::INTERLEAVED, "Error"); +} + +std::vector NlpCreateHeadsSegformerDeviceOperation::compute_output_shapes(const std::vector& input_tensors) const { + std::vector output_shape_vec; + const auto& input_tensor = input_tensors.at(0); + const auto input_shape = input_tensor.get_legacy_shape(); + const auto head_dim = 32; // head_dim is hard-coded = 32 + auto num_heads = input_shape[3] / tt::constants::TILE_HEIGHT; // head_dim is hard-coded = 32 + output_shape_vec = {(tt::tt_metal::LegacyShape) {input_shape[0], num_heads, input_shape[2], head_dim}, (tt::tt_metal::LegacyShape) {input_shape[0], num_heads, input_shape[2], head_dim}, (tt::tt_metal::LegacyShape) {input_shape[0], num_heads, input_shape[2], head_dim}}; + //output_shape_vec = {(tt::tt_metal::LegacyShape) {input_shape[0], num_heads, input_shape[2], head_dim}, }; + + return output_shape_vec; +} + +std::vector NlpCreateHeadsSegformerDeviceOperation::create_output_tensors(const std::vector& input_tensors) const { + const auto& input_tensor = input_tensors.at(0); + if (this->output_mem_config.is_sharded()) { + TT_ASSERT(false); + return {}; + } else { + return operation::generic_create_output_tensors(*this, input_tensors, input_tensor.get_dtype(), Layout::TILE, this->output_mem_config); + } +} + +operation::ProgramWithCallbacks NlpCreateHeadsSegformerDeviceOperation::create_program(const std::vector& input_tensors, std::vector &output_tensors) const { + const auto& input_tensor = input_tensors.at(0); + auto& output_tensor = output_tensors.at(0); + + CoreCoord compute_with_storage_grid_size = input_tensor.device()->compute_with_storage_grid_size(); + + return multi_core_nlp_create_qkv_heads_segformer(input_tensor, output_tensors, compute_with_storage_grid_size); +} +} // namespace ttnn::operations::experimental::transformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.hpp new file mode 100644 index 00000000000..66fb3c45953 --- /dev/null +++ 
b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.hpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "ttnn/run_operation.hpp" +#include + +#include "ttnn/common/constants.hpp" +#include "ttnn/tensor/tensor.hpp" +#include "ttnn/device_operation.hpp" + +namespace ttnn::operations::experimental::transformer { + +operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_segformer(const Tensor &input_tensor_a, std::vector &output, CoreCoord compute_with_storage_grid_size); + +struct NlpCreateHeadsSegformerDeviceOperation { + MemoryConfig output_mem_config; + + void validate(const std::vector& input_tensors) const; + std::vector compute_output_shapes(const std::vector& input_tensors) const; + std::vector create_output_tensors(const std::vector& input_tensors) const; + operation::ProgramWithCallbacks create_program( + const std::vector& input_tensors, std::vector& output_tensors) const; +}; + +} // namespace ttnn::operations::experimental::transformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_program_factory.cpp new file mode 100644 index 00000000000..ef409dae2cf --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_program_factory.cpp @@ -0,0 +1,195 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt_metal/host_api.hpp" +#include "tt_metal/common/constants.hpp" +#include "tt_metal/detail/util.hpp" +#include "nlp_create_qkv_heads_segformer_device_operation.hpp" +#include "tt_metal/common/work_split.hpp" + +namespace ttnn::operations::experimental::transformer { + +using namespace tt::constants; +using namespace tt; +using namespace tt::tt_metal; + + +operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_segformer(const Tensor &a, std::vector& output, CoreCoord compute_with_storage_grid_size) { + + const auto& ashape = a.get_legacy_shape(); + + tt_metal::Device *device = a.device(); + + tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + + uint32_t single_tile_size = tt_metal::detail::TileSize(cb_data_format); + tt_metal::Buffer *in0_buffer = a.buffer(); + TT_ASSERT(in0_buffer->size() % single_tile_size == 0); + // Dummy + tt_metal::Buffer *in1_buffer; + uint32_t in1_buffer_addr = 0; + + + //////////////////////////////////////////////////////////////////////////// + // TM Parameters Setup + //////////////////////////////////////////////////////////////////////////// + const uint32_t head_dim = 32; + uint32_t per_tensor_tiles = ashape[3] / TILE_WIDTH ; + const uint32_t q_num_tiles_per_tensor = per_tensor_tiles; + const uint32_t num_q_heads = q_num_tiles_per_tensor; //hard-coding the head_dim = 32 + + // Per output tensor args + // Output shape for Q/K/V is: [B, head_num, s, 32] # Needs shuffling from [B, 1, s, hidden_dim] + uint32_t q_out_h_tiles = ashape[2] / TILE_WIDTH; + uint32_t q_out_w_tiles = 1; //hard-coding the head_dim = 32 + uint32_t q_out_c = q_num_tiles_per_tensor / q_out_w_tiles; // num_heads + uint32_t q_out_HtWt = q_out_h_tiles * q_out_w_tiles; + uint32_t 
q_out_CHtWt = q_out_c * q_out_HtWt; + uint32_t q_num_tiles = num_q_heads * q_out_w_tiles; + + uint32_t num_cores_x = compute_with_storage_grid_size.x; + uint32_t num_cores_y = compute_with_storage_grid_size.y; + // Block is a unit of work; ie. num of per_tensor_tiles per core + uint32_t num_blocks = ashape[0] * ashape[1] * ashape[2] / TILE_HEIGHT; + auto [num_cores, all_cores, core_group_1, core_group_2, num_blocks_per_core_group_1, num_blocks_per_core_group_2] = tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_blocks); + + + //////////////////////////////////////////////////////////////////////////// + // Grayskull Device Setup + //////////////////////////////////////////////////////////////////////////// + TT_ASSERT((output.size() == 3), "Output vector must be size 3 (Q, K, V)!"); + tt_metal::Tensor& q = output[0]; + tt_metal::Tensor& k = output[1]; + tt_metal::Tensor& v = output[2]; + + tt_metal::Buffer *q_buffer = q.buffer(); + TT_ASSERT(q_buffer != nullptr, "Output q buffer should be allocated on device!"); + tt_metal::Buffer *k_buffer = k.buffer(); + TT_ASSERT(k_buffer != nullptr, "Output k buffer should be allocated on device!"); + tt_metal::Buffer *v_buffer = v.buffer(); + TT_ASSERT(v_buffer != nullptr, "Output v buffer should be allocated on device!"); + + + //////////////////////////////////////////////////////////////////////////// + // Application Setup + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program program = tt_metal::CreateProgram(); + + bool tile_dtype_is_bfloat16 = a.get_dtype() == tt::tt_metal::DataType::BFLOAT16; + bool in0_is_dram = in0_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; + bool out_is_dram = q_buffer->buffer_type() == tt_metal::BufferType::DRAM ?
1 : 0; + bool in1_is_dram = false; + + std::vector reader_compile_time_args = { + // interleaved accessor args + (std::uint32_t) in0_is_dram, + (std::uint32_t) in1_is_dram, + (std::uint32_t) q_num_tiles, + }; + std::vector writer_compile_time_args = { + // interleaved accessor args + (std::uint32_t) out_is_dram, + (std::uint32_t) q_out_h_tiles, + (std::uint32_t) q_out_w_tiles, + (std::uint32_t) q_out_HtWt, + (std::uint32_t) num_q_heads, // q_out_c + }; + + ///////////// K transpose //////////////////// + const bool transpose_k_heads = false; + std::map reader_defines; + std::map writer_defines; + + ////////////////////////////////////////////// + auto reader_kernel_id = tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp", + all_cores, + tt_metal::ReaderDataMovementConfig(reader_compile_time_args, reader_defines)); + + auto writer_kernel_id = tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp", + all_cores, + tt_metal::WriterDataMovementConfig(writer_compile_time_args, writer_defines)); + + + + // Create circular buffers + uint32_t src1_cb_index = 1; + uint32_t cb0_num_tiles = per_tensor_tiles * 2; // double buffer + tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(cb0_num_tiles * single_tile_size, {{src1_cb_index, cb_data_format}}) + .set_page_size(src1_cb_index, single_tile_size); + auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src1_config); + + for (uint32_t i = 0, num_blocks_written = 0; i < num_cores; i++){ + CoreCoord core = {i / num_cores_y, i % num_cores_y}; + uint32_t num_blocks_per_core = 0; + if (core_group_1.contains(core)) { + num_blocks_per_core = num_blocks_per_core_group_1; + } else if (core_group_2.contains(core)) { + num_blocks_per_core = num_blocks_per_core_group_2; + } else { + TT_ASSERT(false, "Core not in specified core ranges"); + } + + std::vector reader_runtime_args = { + (std::uint32_t) in0_buffer->address(), + (std::uint32_t) in1_buffer_addr, + num_blocks_per_core, + num_blocks_written * per_tensor_tiles, + 0, + }; + + uint32_t q_out_h_dim = num_blocks_written % q_out_h_tiles; + uint32_t q_out_tensor_tile_id = num_blocks_written / q_out_h_tiles * q_out_CHtWt + q_out_h_dim * q_out_w_tiles; + + std::vector writer_runtime_args = { + (std::uint32_t) q_buffer->address(), // q_tensor_addr + num_blocks_per_core, // num_blocks + q_out_h_dim, + q_out_tensor_tile_id, + }; + + tt_metal::SetRuntimeArgs(program, reader_kernel_id, core, reader_runtime_args); + tt_metal::SetRuntimeArgs(program, writer_kernel_id, core, writer_runtime_args); + num_blocks_written += num_blocks_per_core; + } + + auto override_runtime_args_callback = [ + reader_kernel_id, + writer_kernel_id, + num_cores, + num_cores_y + ] + ( + const Program &program, + const std::vector& input_buffers, + const std::vector& output_buffers + ) { + + auto src_dram_buffer = input_buffers.at(0); + + auto dst_dram_buffer_query = output_buffers.at(0); + + for (uint32_t i = 0, num_blocks_written = 0; i < num_cores; i++){ + CoreCoord core = {i / num_cores_y, i % num_cores_y}; + + { + auto &runtime_args = GetRuntimeArgs(program, reader_kernel_id, core); + runtime_args[0] = src_dram_buffer->address(); + } + + { + auto &runtime_args = GetRuntimeArgs(program, writer_kernel_id, core); + runtime_args[0] = 
dst_dram_buffer_query->address(); + } + } + }; + + return {std::move(program), override_runtime_args_callback}; +} + + +} // ttnn::operations::experimental::transformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.cpp new file mode 100644 index 00000000000..176192091a0 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.cpp @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "nlp_create_qkv_heads_segformer.hpp" + +namespace ttnn::operations::experimental::transformer { + + std::tuple NLPCreateHeadsSegformerOperation::invoke ( + uint8_t queue_id, + const Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>> optional_output_tensors) { + const MemoryConfig output_mem_config = memory_config.value_or(input_tensor_q.memory_config()); + auto optional_outputs = std::vector>{}; + if (optional_output_tensors.has_value()) { + optional_outputs = {optional_output_tensors.value().begin(), optional_output_tensors.value().end()}; + } + else { + optional_outputs = {}; + } + auto outputs = operation::run(NlpCreateHeadsSegformerDeviceOperation{output_mem_config}, {input_tensor_q}, {}, optional_outputs); + return {outputs[0], outputs[1], outputs[2]}; + //return {outputs[0]} + }; + + std::tuple NLPCreateHeadsSegformerOperation::invoke ( + const Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>> optional_output_tensors) { + return invoke(ttnn::DefaultQueueId, input_tensor_q, memory_config, optional_output_tensors); + }; + +} // namespace ttnn::operations::experimental::transformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.hpp new file mode 100644 index 00000000000..4ac1ec051ec --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.hpp @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "device/nlp_create_qkv_heads_segformer_device_operation.hpp" +#include "ttnn/run_operation.hpp" +#include "ttnn/operations/core/core.hpp" + +namespace ttnn { +namespace operations::experimental::transformer { + +struct NLPCreateHeadsSegformerOperation { + static std::tuple invoke ( + uint8_t queue_id, + const Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>> optional_output_tensors = std::nullopt); + + static std::tuple invoke ( + const Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>> optional_output_tensors = std::nullopt); +}; +} // namespace operations::experimental::transformer + + +namespace experimental { + +constexpr auto nlp_create_qkv_heads_segformer = ttnn::register_operation_with_auto_launch_op< + "ttnn::experimental::nlp_create_qkv_heads_segformer", + ttnn::operations::experimental::transformer::NLPCreateHeadsSegformerOperation>(); + +} // namespace experimental + +} // namespace ttnn diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.cpp new file mode 100644 index 00000000000..adf07afb76a --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.cpp @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "ttnn/cpp/pybind11/decorators.hpp" + +#include "ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.hpp" +#include "ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.hpp" + + +namespace ttnn::operations::experimental::transformer::detail { + +void bind_nlp_create_qkv_heads_segformer(pybind11::module& module) { + ttnn::bind_registered_operation( + module, + ttnn::experimental::nlp_create_qkv_heads_segformer, + R"doc( + Shuffles [B, 1, S, 2304] fused qkv matrix into 3 heads with shapes [B, 12, S, 64], [B, 12, S, 64], and [B, 12, S, 64]. + )doc", + ttnn::pybind_overload_t{ + [] (const decltype(ttnn::experimental::nlp_create_qkv_heads_segformer) &self, + const ttnn::Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>>& optional_output_tensors, + uint8_t queue_id) { + return self(queue_id, input_tensor_q, memory_config, optional_output_tensors); + }, + pybind11::arg("input").noconvert(), + pybind11::kw_only(), + pybind11::arg("memory_config").noconvert() = std::nullopt, + pybind11::arg("output_tensors").noconvert() = std::nullopt, + pybind11::arg("queue_id") = 0}); +}; +} // namespace ttnn::operations::experimental::transformer::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.hpp new file mode 100644 index 00000000000..5d528d396e5 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.hpp @@ -0,0 +1,13 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
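For reference, the binding registered above would be used from Python roughly as follows. This is an illustrative sketch only; the tensor shape, device handle, and memory config are assumptions chosen to match the "[B, 1, S, 2304]" fused qkv description in the docstring, not values taken from this change:

>>> fused_qkv = ttnn.from_torch(torch.randn(1, 1, 256, 2304, dtype=torch.bfloat16), layout=ttnn.TILE_LAYOUT, device=device)
>>> q, k, v = ttnn.experimental.nlp_create_qkv_heads_segformer(fused_qkv, memory_config=ttnn.DRAM_MEMORY_CONFIG)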
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "pybind11/pybind_fwd.hpp" + + +namespace ttnn::operations::experimental::transformer::detail { + +void bind_nlp_create_qkv_heads_segformer(pybind11::module& module); +} // namespace ttnn::operations::experimental::transformer::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp new file mode 100644 index 00000000000..6405790b53b --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp @@ -0,0 +1,95 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + + +void kernel_main() { + // READER RUNTIME ARGS + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in1_tensor_addr = get_arg_val(1); + uint32_t num_blocks = get_arg_val(2); + uint32_t in0_tensor_tile_id = get_arg_val(3); + uint32_t in1_tensor_tile_id = get_arg_val(4); + + // COMPILE TIME ARGS + // interleaved accessor args + constexpr uint32_t in0_is_dram = get_compile_time_arg_val(0); + constexpr uint32_t in1_is_dram = get_compile_time_arg_val(1); + // READER COMPILE TIME ARGS + constexpr uint32_t q_num_tiles = get_compile_time_arg_val(2); + constexpr uint32_t kv_num_tiles = get_compile_time_arg_val(3); + + + constexpr uint32_t cb_id_qv = 1; // cb for Q, V heads + #ifdef TRANSPOSE_K_HEADS + constexpr uint32_t cb_id_k = 0; // cb for K heads (used by compute) + #else + constexpr uint32_t cb_id_k = 1; // cb for K heads (directly to writer) + #endif + + constexpr uint32_t onetile = 1; + const uint32_t single_tile_size_bytes = get_tile_size(cb_id_qv); + const DataFormat data_format = get_dataformat(cb_id_qv); + + constexpr bool in0_is_dram_bool = in0_is_dram == 1; + const InterleavedAddrGenFast s0 = { + .bank_base_address = in0_tensor_addr, + .page_size = single_tile_size_bytes, + .data_format = data_format, + }; + + #ifdef READ_FROM_INPUT_TENSOR_KV + constexpr bool in1_is_dram_bool = in1_is_dram == 1; + const InterleavedAddrGenFast s1 = { + .bank_base_address = in1_tensor_addr, + .page_size = single_tile_size_bytes, + .data_format = data_format, + }; + #endif + + + for (uint32_t block = 0; block < num_blocks; block++) { + // Q + for (uint32_t i = 0; i < q_num_tiles; i++) { + cb_reserve_back(cb_id_qv, onetile); + uint32_t l1_write_addr = get_write_ptr(cb_id_qv); + noc_async_read_tile(in0_tensor_tile_id, s0, l1_write_addr); + noc_async_read_barrier(); + cb_push_back(cb_id_qv, onetile); + in0_tensor_tile_id++; + } + + // K + for (uint32_t i = 0; i < kv_num_tiles; i++) { + cb_reserve_back(cb_id_k, onetile); + uint32_t l1_write_addr = get_write_ptr(cb_id_k); + #ifdef READ_FROM_INPUT_TENSOR_KV + noc_async_read_tile(in1_tensor_tile_id, s1, l1_write_addr); + in1_tensor_tile_id++; + #else + noc_async_read_tile(in0_tensor_tile_id, s0, l1_write_addr); + in0_tensor_tile_id++; + #endif + noc_async_read_barrier(); + cb_push_back(cb_id_k, onetile); + } + + // V + for (uint32_t i = 0; i < kv_num_tiles; i++) { + cb_reserve_back(cb_id_qv, onetile); + uint32_t l1_write_addr = get_write_ptr(cb_id_qv); + #ifdef READ_FROM_INPUT_TENSOR_KV + noc_async_read_tile(in1_tensor_tile_id, s1, l1_write_addr); + in1_tensor_tile_id++; + #else + 
noc_async_read_tile(in0_tensor_tile_id, s0, l1_write_addr); + in0_tensor_tile_id++; + #endif + noc_async_read_barrier(); + cb_push_back(cb_id_qv, onetile); + } + } +} diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp new file mode 100644 index 00000000000..a8cd96bb20d --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp @@ -0,0 +1,155 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include "dataflow_api.h" + + +void kernel_main() { + // WRITER RUNTIME ARGS + uint32_t q_tensor_addr = get_arg_val(0); + uint32_t k_tensor_addr = get_arg_val(1); + uint32_t v_tensor_addr = get_arg_val(2); + uint32_t num_blocks = get_arg_val(3); + uint32_t q_out_h_dim = get_arg_val(4); + uint32_t q_out_tensor_tile_id = get_arg_val(5); + uint32_t k_out_tensor_tile_id = get_arg_val(6); + uint32_t v_out_tensor_tile_id = get_arg_val(7); + + // COMPILE TIME ARGS + // interleaved accessor args + constexpr uint32_t out_is_dram = get_compile_time_arg_val(0); + constexpr uint32_t q_out_h_tiles = get_compile_time_arg_val(1); + constexpr uint32_t q_out_w_tiles = get_compile_time_arg_val(2); + constexpr uint32_t q_out_HtWt = get_compile_time_arg_val(3); + constexpr uint32_t q_out_c = get_compile_time_arg_val(4); + constexpr uint32_t kv_out_c = get_compile_time_arg_val(5); + + + constexpr uint32_t cb_id_qv = 1; // cb for Q, V heads tiles + #ifdef TRANSPOSE_K_HEADS + constexpr uint32_t cb_id_k = 16; // cb for K heads (filled by compute) + #else + constexpr uint32_t cb_id_k = 1; // cb for K heads (directly from reader) + #endif + const uint32_t single_tile_size_bytes = get_tile_size(cb_id_qv); + const DataFormat data_format = get_dataformat(cb_id_qv); + + constexpr bool out_is_dram_bool = out_is_dram == 1; + const InterleavedAddrGenFast sq = { + .bank_base_address = q_tensor_addr, + .page_size = single_tile_size_bytes, + .data_format = data_format + }; + const InterleavedAddrGenFast sk = { + .bank_base_address = k_tensor_addr, + .page_size = single_tile_size_bytes, + .data_format = data_format + }; + const InterleavedAddrGenFast sv = { + .bank_base_address = v_tensor_addr, + .page_size = single_tile_size_bytes, + .data_format = data_format + }; + + constexpr uint32_t block_size = 1; // micro-block size for read/write; nothing to do with num_blocks + // TODO: This might negatively impact perf + constexpr uint32_t out_num_tiles_read = block_size; // always read and pop by micro-block size for generality + uint32_t l1_read_addr; + uint32_t q_out_tensor_current_tile_id; // need this to update q_out_tensor_tile_id + uint32_t k_out_tensor_current_tile_id; // need this to update k_out_tensor_tile_id + uint32_t v_out_tensor_current_tile_id; // need this to update v_out_tensor_tile_id + uint32_t out_tensor_current_tile_id_along_c; + + for (uint32_t block = 0; block < num_blocks; block++) { + // q + create q head --> outputs: [B, num_q_heads, S, head_dim] + out_tensor_current_tile_id_along_c = q_out_tensor_tile_id; + for (uint32_t c_dim = 0; c_dim < q_out_c; c_dim++) { + q_out_tensor_current_tile_id = out_tensor_current_tile_id_along_c; + for (uint32_t w_dim = 0; w_dim < q_out_w_tiles; w_dim++) { + 
cb_wait_front(cb_id_qv, out_num_tiles_read); + l1_read_addr = get_read_ptr(cb_id_qv); + noc_async_write_tile(q_out_tensor_current_tile_id, sq, l1_read_addr); + + noc_async_write_barrier(); + cb_pop_front(cb_id_qv, out_num_tiles_read); + + q_out_tensor_current_tile_id++; + } + out_tensor_current_tile_id_along_c += q_out_HtWt; + } + + // k + create k head --> outputs: [B, num_kv_heads, S, head_dim] + #ifndef TRANSPOSE_K_HEADS + out_tensor_current_tile_id_along_c = k_out_tensor_tile_id; + #else + k_out_tensor_current_tile_id = k_out_tensor_tile_id; + #endif + for (uint32_t c_dim = 0; c_dim < kv_out_c; c_dim++) { + #ifndef TRANSPOSE_K_HEADS + k_out_tensor_current_tile_id = out_tensor_current_tile_id_along_c; + #endif + for (uint32_t w_dim = 0; w_dim < q_out_w_tiles; w_dim++) { + cb_wait_front(cb_id_k, out_num_tiles_read); + l1_read_addr = get_read_ptr(cb_id_k); + noc_async_write_tile(k_out_tensor_current_tile_id, sk, l1_read_addr); + + noc_async_write_barrier(); + cb_pop_front(cb_id_k, out_num_tiles_read); + + #ifndef TRANSPOSE_K_HEADS + k_out_tensor_current_tile_id++; + #else + k_out_tensor_current_tile_id += q_out_h_tiles; + #endif + } + #ifndef TRANSPOSE_K_HEADS + out_tensor_current_tile_id_along_c += q_out_HtWt; + #endif + } + + // v + create v head --> outputs: [B, num_kv_heads, S, head_dim] + out_tensor_current_tile_id_along_c = v_out_tensor_tile_id; + for (uint32_t c_dim = 0; c_dim < kv_out_c; c_dim++) { + v_out_tensor_current_tile_id = out_tensor_current_tile_id_along_c; + for (uint32_t w_dim = 0; w_dim < q_out_w_tiles; w_dim++) { + cb_wait_front(cb_id_qv, out_num_tiles_read); + l1_read_addr = get_read_ptr(cb_id_qv); + noc_async_write_tile(v_out_tensor_current_tile_id, sv, l1_read_addr); + + noc_async_write_barrier(); + cb_pop_front(cb_id_qv, out_num_tiles_read); + + v_out_tensor_current_tile_id++; + } + out_tensor_current_tile_id_along_c += q_out_HtWt; + } + + // Update out_tensor_tile_id for next h_dim or batch if we finish one CHtWt + q_out_h_dim++; + if (q_out_h_dim < q_out_h_tiles) { + q_out_tensor_tile_id += q_out_w_tiles; + #ifndef TRANSPOSE_K_HEADS + k_out_tensor_tile_id += q_out_w_tiles; + #else + k_out_tensor_tile_id++; + #endif + v_out_tensor_tile_id += q_out_w_tiles; + } else { + // If we finish one batch, always roll over to next tile in memory + // This is just the current_tile_id, except for K when we transpose heads + // In this case, decrement k_out_tensor_current_tile_id by the stride (q_out_h_tiles) and add 1 to roll over + q_out_tensor_tile_id = q_out_tensor_current_tile_id; + #ifndef TRANSPOSE_K_HEADS + k_out_tensor_tile_id = k_out_tensor_current_tile_id; + #else + k_out_tensor_tile_id = ++k_out_tensor_current_tile_id - q_out_h_tiles; // inc by 1 and decrement by stride + #endif + v_out_tensor_tile_id = v_out_tensor_current_tile_id; + q_out_h_dim = 0; + } + } + +} diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.cpp new file mode 100644 index 00000000000..f528f4bf966 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.cpp @@ -0,0 +1,52 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "nlp_create_qkv_heads_vit_device_operation.hpp" + +#include "tt_metal/common/work_split.hpp" + +namespace ttnn::operations::experimental::transformer { + +// Hard-coded for Vit +void NlpCreateHeadsVitDeviceOperation::validate(const std::vector& input_tensors) const { + const auto& input_tensor = input_tensors.at(0); + const auto input_shape = input_tensor.get_legacy_shape(); + + TT_FATAL(input_tensor.storage_type() == StorageType::DEVICE, "Operands to TM need to be on device!"); + TT_FATAL(input_tensor.buffer() != nullptr, "Operands to TM need to be allocated in buffers on device!"); + TT_FATAL(input_tensor.get_dtype() == tt::tt_metal::DataType::FLOAT32 || input_tensor.get_dtype() == tt::tt_metal::DataType::BFLOAT16 || input_tensor.get_dtype() == tt::tt_metal::DataType::BFLOAT8_B, "Unsupported data format"); + TT_FATAL(input_tensor.get_layout() == Layout::TILE, "Error"); + + TT_FATAL(input_shape[2] % tt::constants::TILE_HEIGHT == 0, "Error"); + TT_FATAL((input_shape == tt::tt_metal::LegacyShape({input_shape[0], 1, input_shape[2], 2304})), "Unsupported input shape"); + TT_FATAL(this->output_mem_config.memory_layout == TensorMemoryLayout::INTERLEAVED, "Error"); +} + +std::vector NlpCreateHeadsVitDeviceOperation::compute_output_shapes(const std::vector& input_tensors) const { + std::vector output_shape_vec; + const auto& input_tensor = input_tensors.at(0); + const auto input_shape = input_tensor.get_legacy_shape(); + output_shape_vec = {(tt::tt_metal::LegacyShape) {input_shape[0], 12, input_shape[2], 64}, (tt::tt_metal::LegacyShape) {input_shape[0], 12, input_shape[2], 64}, (tt::tt_metal::LegacyShape) {input_shape[0], 12, input_shape[2], 64}}; + return output_shape_vec; +} + +std::vector NlpCreateHeadsVitDeviceOperation::create_output_tensors(const std::vector& input_tensors) const { + const auto& input_tensor = input_tensors.at(0); + if (this->output_mem_config.is_sharded()) { + TT_ASSERT(false); + return {}; + } else { + return operation::generic_create_output_tensors(*this, input_tensors, input_tensor.get_dtype(), Layout::TILE, this->output_mem_config); + } +} + +operation::ProgramWithCallbacks NlpCreateHeadsVitDeviceOperation::create_program(const std::vector& input_tensors, std::vector &output_tensors) const { + const auto& input_tensor = input_tensors.at(0); + auto& output_tensor = output_tensors.at(0); + + CoreCoord compute_with_storage_grid_size = input_tensor.device()->compute_with_storage_grid_size(); + + return multi_core_nlp_create_qkv_heads_vit(input_tensor, output_tensors, compute_with_storage_grid_size); +} +} // namespace ttnn::operations::experimental::transformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.hpp new file mode 100644 index 00000000000..2891d6a27ec --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.hpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "ttnn/run_operation.hpp" +#include + +#include "ttnn/common/constants.hpp" +#include "ttnn/tensor/tensor.hpp" +#include "ttnn/device_operation.hpp" + +namespace ttnn::operations::experimental::transformer { + +operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_vit(const Tensor &input_tensor_a, std::vector &output, CoreCoord compute_with_storage_grid_size); + +struct NlpCreateHeadsVitDeviceOperation { + MemoryConfig output_mem_config; + + void validate(const std::vector& input_tensors) const; + std::vector compute_output_shapes(const std::vector& input_tensors) const; + std::vector create_output_tensors(const std::vector& input_tensors) const; + operation::ProgramWithCallbacks create_program( + const std::vector& input_tensors, std::vector& output_tensors) const; +}; + +} // namespace ttnn::operations::experimental::transformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_program_factory.cpp new file mode 100644 index 00000000000..1e91f35399e --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_program_factory.cpp @@ -0,0 +1,249 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt_metal/host_api.hpp" +#include "tt_metal/common/constants.hpp" +#include "tt_metal/detail/util.hpp" +#include "nlp_create_qkv_heads_vit_device_operation.hpp" +#include "tt_metal/common/work_split.hpp" + +namespace ttnn::operations::experimental::transformer { + +using namespace tt::constants; +using namespace tt; +using namespace tt::tt_metal; + + +operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_vit(const Tensor &a, std::vector& output, CoreCoord compute_with_storage_grid_size) { + + const auto& ashape = a.get_legacy_shape(); + + tt_metal::Device *device = a.device(); + + tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + + uint32_t single_tile_size = tt_metal::detail::TileSize(cb_data_format); + tt_metal::Buffer *in0_buffer = a.buffer(); + TT_ASSERT(in0_buffer->size() % single_tile_size == 0); + // Dummy + tt_metal::Buffer *in1_buffer; + uint32_t in1_buffer_addr = 0; + + + //////////////////////////////////////////////////////////////////////////// + // TM Parameters Setup + //////////////////////////////////////////////////////////////////////////// + uint32_t per_tensor_tiles = ashape[3] / TILE_WIDTH ; // 72 + const uint32_t q_num_tiles_per_tensor = 24; + const uint32_t kv_num_tiles_per_tensor = 24; + const uint32_t num_q_heads = 12; + const uint32_t num_kv_heads = 12; + + // Per output tensor args + // Output shape for Q,K,V is: [B, 12, s, 64] # Needs shuffling from [B, 1, s, 2304] + uint32_t q_out_h_tiles = ashape[2] / TILE_HEIGHT; + uint32_t q_out_w_tiles = 2; // head_dim + uint32_t q_out_c = q_num_tiles_per_tensor / q_out_w_tiles; // num_heads + uint32_t q_out_HtWt = q_out_h_tiles * q_out_w_tiles; + uint32_t q_out_CHtWt = q_out_c * q_out_HtWt; + uint32_t kv_out_CHtWt = num_kv_heads * q_out_HtWt; + uint32_t q_num_tiles = num_q_heads * q_out_w_tiles; + uint32_t kv_num_tiles = num_kv_heads * q_out_w_tiles; + + uint32_t num_cores_x = compute_with_storage_grid_size.x; + uint32_t num_cores_y = compute_with_storage_grid_size.y; + 
// Block is a unit of work; ie. num of per_tensor_tiles per core + uint32_t num_blocks = ashape[0] * ashape[1] * ashape[2] / TILE_HEIGHT; + auto [num_cores, all_cores, core_group_1, core_group_2, num_blocks_per_core_group_1, num_blocks_per_core_group_2] = tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_blocks); + + + //////////////////////////////////////////////////////////////////////////// + // Grayskull Device Setup + //////////////////////////////////////////////////////////////////////////// + TT_ASSERT((output.size() == 3), "Output vector must be size 3 for split fused qkv!"); + tt_metal::Tensor& q = output[0]; + tt_metal::Tensor& k = output[1]; + tt_metal::Tensor& v = output[2]; + + tt_metal::Buffer *q_buffer = q.buffer(); + TT_ASSERT(q_buffer != nullptr, "Output q buffer should be allocated on device!"); + tt_metal::Buffer *k_buffer = k.buffer(); + TT_ASSERT(k_buffer != nullptr, "Output k buffer should be allocated on device!"); + tt_metal::Buffer *v_buffer = v.buffer(); + TT_ASSERT(v_buffer != nullptr, "Output v buffer should be allocated on device!"); + + + //////////////////////////////////////////////////////////////////////////// + // Application Setup + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program program = tt_metal::CreateProgram(); + + bool tile_dtype_is_bfloat16 = a.get_dtype() == tt::tt_metal::DataType::BFLOAT16; + bool in0_is_dram = in0_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; + bool out_is_dram = q_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; + bool in1_is_dram = false; + + std::vector reader_compile_time_args = { + // interleaved accessor args + (std::uint32_t) in0_is_dram, + (std::uint32_t) in1_is_dram, + (std::uint32_t) q_num_tiles, + (std::uint32_t) kv_num_tiles, + }; + std::vector writer_compile_time_args = { + // interleaved accessor args + (std::uint32_t) out_is_dram, + (std::uint32_t) q_out_h_tiles, + (std::uint32_t) q_out_w_tiles, + (std::uint32_t) q_out_HtWt, + (std::uint32_t) num_q_heads, // q_out_c + (std::uint32_t) num_kv_heads, // kv_out_c + }; + + ///////////// K transpose //////////////////// + const bool transpose_k_heads = false; + std::map reader_defines; + std::map writer_defines; + if (transpose_k_heads) { + std::vector compute_args_core_group_1 = {num_blocks_per_core_group_1 * kv_num_tiles}; + auto compute_kernel_id_group_1 = tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp", + core_group_1, + tt_metal::ComputeConfig{.compile_args = compute_args_core_group_1} + ); + + if (core_group_2.num_cores() > 0) { + std::vector compute_args_core_group_2 = {num_blocks_per_core_group_2 * kv_num_tiles}; + auto compute_kernel_id_group_2 = tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp", + core_group_2, + tt_metal::ComputeConfig{.compile_args = compute_args_core_group_2} + ); + } + reader_defines["TRANSPOSE_K_HEADS"] = "1"; + writer_defines["TRANSPOSE_K_HEADS"] = "1"; + } + ////////////////////////////////////////////// + + auto reader_kernel_id = tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp", + all_cores, + tt_metal::ReaderDataMovementConfig(reader_compile_time_args, reader_defines)); + + auto writer_kernel_id = tt_metal::CreateKernel( + program, + 
"ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp", + all_cores, + tt_metal::WriterDataMovementConfig(writer_compile_time_args, writer_defines)); + + + + // Create circular buffers + uint32_t src1_cb_index = 1; + uint32_t cb0_num_tiles = per_tensor_tiles * 2; // double buffer + tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(cb0_num_tiles * single_tile_size, {{src1_cb_index, cb_data_format}}) + .set_page_size(src1_cb_index, single_tile_size); + auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src1_config); + + // If we transpose_k_heads: + // - reader will write to cb0, instead of cb1 + // - compute will wait on cb0 and write to cb16 + // - writer will wait on cb 16, instead of cb1 + if (transpose_k_heads) { + uint32_t src0_cb_index = 0; + uint32_t cb0_num_tiles = per_tensor_tiles * 2; // double buffer + tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(cb0_num_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) + .set_page_size(src0_cb_index, single_tile_size); + auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); + + uint32_t out_cb_index = 16; + uint32_t out_cb_num_tiles = per_tensor_tiles * 2; // double buffer + tt_metal::CircularBufferConfig cb_out_config = tt_metal::CircularBufferConfig(out_cb_num_tiles * single_tile_size, {{out_cb_index, cb_data_format}}) + .set_page_size(out_cb_index, single_tile_size); + auto cb_out = tt_metal::CreateCircularBuffer(program, all_cores, cb_out_config); + } + + for (uint32_t i = 0, num_blocks_written = 0; i < num_cores; i++){ + CoreCoord core = {i / num_cores_y, i % num_cores_y}; + uint32_t num_blocks_per_core = 0; + if (core_group_1.contains(core)) { + num_blocks_per_core = num_blocks_per_core_group_1; + } else if (core_group_2.contains(core)) { + num_blocks_per_core = num_blocks_per_core_group_2; + } else { + TT_ASSERT(false, "Core not in specified core ranges"); + } + + std::vector reader_runtime_args = { + (std::uint32_t) in0_buffer->address(), + (std::uint32_t) in1_buffer_addr, + num_blocks_per_core, + num_blocks_written * per_tensor_tiles, + 0, + }; + + uint32_t q_out_h_dim = num_blocks_written % q_out_h_tiles; + uint32_t q_out_tensor_tile_id = num_blocks_written / q_out_h_tiles * q_out_CHtWt + q_out_h_dim * q_out_w_tiles; + uint32_t v_out_tensor_tile_id = num_blocks_written / q_out_h_tiles * kv_out_CHtWt + q_out_h_dim * q_out_w_tiles; + uint32_t k_out_tensor_tile_id = transpose_k_heads ? 
num_blocks_written / q_out_h_tiles * kv_out_CHtWt + q_out_h_dim : v_out_tensor_tile_id; + + std::vector writer_runtime_args = { + (std::uint32_t) q_buffer->address(), // q_tensor_addr + (std::uint32_t) k_buffer->address(), // k_tensor_addr + (std::uint32_t) v_buffer->address(), // v_tensor_addr + num_blocks_per_core, // num_blocks + q_out_h_dim, + q_out_tensor_tile_id, + k_out_tensor_tile_id, + v_out_tensor_tile_id, + }; + + tt_metal::SetRuntimeArgs(program, reader_kernel_id, core, reader_runtime_args); + tt_metal::SetRuntimeArgs(program, writer_kernel_id, core, writer_runtime_args); + num_blocks_written += num_blocks_per_core; + } + + auto override_runtime_args_callback = [ + reader_kernel_id, + writer_kernel_id, + num_cores, + num_cores_y + ] + ( + const Program &program, + const std::vector& input_buffers, + const std::vector& output_buffers + ) { + + auto src_dram_buffer = input_buffers.at(0); + + auto dst_dram_buffer_query = output_buffers.at(0); + auto dst_dram_buffer_key = output_buffers.at(1); + auto dst_dram_buffer_value = output_buffers.at(2); + + for (uint32_t i = 0, num_blocks_written = 0; i < num_cores; i++){ + CoreCoord core = {i / num_cores_y, i % num_cores_y}; + + { + auto &runtime_args = GetRuntimeArgs(program, reader_kernel_id, core); + runtime_args[0] = src_dram_buffer->address(); + } + + { + auto &runtime_args = GetRuntimeArgs(program, writer_kernel_id, core); + runtime_args[0] = dst_dram_buffer_query->address(); + runtime_args[1] = dst_dram_buffer_key->address(); + runtime_args[2] = dst_dram_buffer_value->address(); + } + } + }; + + return {std::move(program), override_runtime_args_callback}; +} + + +} // ttnn::operations::experimental::transformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.cpp new file mode 100644 index 00000000000..55afb0f3daf --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.cpp @@ -0,0 +1,33 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "nlp_create_qkv_heads_vit.hpp" + +namespace ttnn::operations::experimental::transformer { + + std::tuple NLPCreateHeadsVitOperation::invoke ( + uint8_t queue_id, + const Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>> optional_output_tensors) { + const MemoryConfig output_mem_config = memory_config.value_or(input_tensor_q.memory_config()); + auto optional_outputs = std::vector>{}; + if (optional_output_tensors.has_value()) { + optional_outputs = {optional_output_tensors.value().begin(), optional_output_tensors.value().end()}; + } + else { + optional_outputs = {}; + } + auto outputs = operation::run(NlpCreateHeadsVitDeviceOperation{output_mem_config}, {input_tensor_q}, {}, optional_outputs); + return {outputs[0], outputs[1], outputs[2]}; + }; + + std::tuple NLPCreateHeadsVitOperation::invoke ( + const Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>> optional_output_tensors) { + return invoke(ttnn::DefaultQueueId, input_tensor_q, memory_config, optional_output_tensors); + }; + +} // namespace ttnn::operations::experimental::transformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.hpp new file mode 100644 index 00000000000..cb8ba8197e1 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.hpp @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "device/nlp_create_qkv_heads_vit_device_operation.hpp" +#include "ttnn/run_operation.hpp" +#include "ttnn/operations/core/core.hpp" + +namespace ttnn { +namespace operations::experimental::transformer { + +struct NLPCreateHeadsVitOperation { + static std::tuple invoke ( + uint8_t queue_id, + const Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>> optional_output_tensors = std::nullopt); + + static std::tuple invoke ( + const Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>> optional_output_tensors = std::nullopt); +}; +} // namespace operations::experimental::transformer + + +namespace experimental { + +constexpr auto nlp_create_qkv_heads_vit = ttnn::register_operation_with_auto_launch_op< + "ttnn::experimental::nlp_create_qkv_heads_vit", + ttnn::operations::experimental::transformer::NLPCreateHeadsVitOperation>(); + +} // namespace experimental + +} // namespace ttnn diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.cpp new file mode 100644 index 00000000000..0fde58745b4 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.cpp @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "ttnn/cpp/pybind11/decorators.hpp" + +#include "ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.hpp" +#include "ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.hpp" + + +namespace ttnn::operations::experimental::transformer::detail { + +void bind_nlp_create_qkv_heads_vit(pybind11::module& module) { + ttnn::bind_registered_operation( + module, + ttnn::experimental::nlp_create_qkv_heads_vit, + R"doc( + Shuffles [B, 1, S, 2304] fused qkv matrix into 3 heads with shapes [B, 12, S, 64], [B, 12, S, 64], and [B, 12, S, 64]. + )doc", + ttnn::pybind_overload_t{ + [] (const decltype(ttnn::experimental::nlp_create_qkv_heads_vit) &self, + const ttnn::Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>>& optional_output_tensors, + uint8_t queue_id) { + return self(queue_id, input_tensor_q, memory_config, optional_output_tensors); + }, + pybind11::arg("input").noconvert(), + pybind11::kw_only(), + pybind11::arg("memory_config").noconvert() = std::nullopt, + pybind11::arg("output_tensors").noconvert() = std::nullopt, + pybind11::arg("queue_id") = 0}); +}; +} // namespace ttnn::operations::experimental::transformer::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.hpp new file mode 100644 index 00000000000..49f53dc5ca9 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.hpp @@ -0,0 +1,13 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "pybind11/pybind_fwd.hpp" + + +namespace ttnn::operations::experimental::transformer::detail { + +void bind_nlp_create_qkv_heads_vit(pybind11::module& module); +} // namespace ttnn::operations::experimental::transformer::detail diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp index 93e36fee7f2..2e7144040f2 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp @@ -21,7 +21,6 @@ namespace operations { namespace matmul { using ttnn::operations::unary::UnaryWithParam; -using tt::tt_metal::LegacyShape; /* * GENERAL MATMUL AND BMM @@ -170,8 +169,6 @@ struct Matmul { const std::vector &input_tensors, const std::vector> &optional_input_tensors) const; std::vector compute_output_shapes(const std::vector &input_tensors) const; - std::vector compute_output_shapes_dram_sharded( - const std::vector &input_tensors, uint32_t N_unpadded) const; std::vector create_output_tensors(const std::vector &input_tensors) const; operation::ProgramWithCallbacks create_program( const std::vector &input_tensors, diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul.cpp b/ttnn/cpp/ttnn/operations/matmul/matmul.cpp index c2d14cddedc..5bc05ffedb3 100644 --- a/ttnn/cpp/ttnn/operations/matmul/matmul.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/matmul.cpp @@ -18,7 +18,7 @@ namespace matmul { namespace detail { -bool is_input_batched(const ttnn::Shape& shape) { +bool is_input_batched(const ttnn::SimpleShape& shape) { auto is_batched = false; for (auto i = 0; i < shape.rank() - 2; ++i) { if (shape[i] > 1) { @@ -109,7 +109,7 @@ Tensor MatmulOperation::invoke( if (core_grid.has_value()) { user_core_coord = CoreCoord(core_grid->x, core_grid->y); } - bool user_run_batched = detail::is_input_batched(input_tensor_b.get_shape()); + bool user_run_batched = detail::is_input_batched(input_tensor_b.get_logical_shape()); return bound_matmul( input_tensor_a, input_tensor_b, @@ -147,7 +147,7 @@ Tensor LinearOperation::invoke( if (core_grid.has_value()) { user_core_coord = CoreCoord(core_grid->x, core_grid->y); } - bool b_is_batched = detail::is_input_batched(input_tensor_b.get_shape()); + bool b_is_batched = detail::is_input_batched(input_tensor_b.get_logical_shape()); TT_FATAL(!(b_is_batched && bias.has_value()), "Batched input not supported when bias exists (linear operation)."); return bound_matmul( diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul.hpp b/ttnn/cpp/ttnn/operations/matmul/matmul.hpp index eb450bd2896..1501b005471 100644 --- a/ttnn/cpp/ttnn/operations/matmul/matmul.hpp +++ b/ttnn/cpp/ttnn/operations/matmul/matmul.hpp @@ -22,7 +22,7 @@ namespace matmul { namespace detail { -bool is_input_batched(const ttnn::Shape& shape); +bool is_input_batched(const ttnn::SimpleShape& logical_shape); } // namespace detail diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.cpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.cpp index 0278c3da6c3..2fc0719ec97 100644 --- a/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.cpp @@ -57,46 +57,39 @@ void Reduce::validate(const std::vector& input_tensors) const { } } -std::vector Reduce::compute_output_shapes(const std::vector& input_tensors) const { +std::vector Reduce::compute_output_specs(const std::vector& input_tensors) const { const auto& input_tensor = 
input_tensors.at(0); - auto output_shape = input_tensor.get_legacy_shape(); - auto padding = output_shape.padding(); + // TODO: Remove usage of input/output padded shape + // - Get output alignment from input alignment and output dtype, layout, mem_config + // - Get shard spec from output strides (logical shape + alignment)? + auto output_shape = input_tensor.get_logical_shape(); + auto output_padded_shape = input_tensor.get_padded_shape(); switch (this->dim) { case ReduceOpDim::H: - output_shape[2] = TILE_HEIGHT; - padding[2] = Padding::PadDimension{0, 31}; + output_shape[2] = 1; + output_padded_shape[2] = TILE_HEIGHT; break; case ReduceOpDim::W: - output_shape[3] = TILE_WIDTH; - padding[3] = Padding::PadDimension{0, 31}; + output_shape[3] = 1; + output_padded_shape[3] = TILE_WIDTH; break; case ReduceOpDim::HW: - output_shape[2] = TILE_HEIGHT; - output_shape[3] = TILE_WIDTH; - padding[2] = Padding::PadDimension{0, 31}; - padding[3] = Padding::PadDimension{0, 31}; + output_shape[2] = 1; + output_shape[3] = 1; + output_padded_shape[2] = TILE_HEIGHT; + output_padded_shape[3] = TILE_WIDTH; break; } - return {tt::tt_metal::LegacyShape(output_shape, padding)}; -} -std::vector Reduce::create_output_tensors(const std::vector& input_tensors) const { - const auto& input_tensor = input_tensors.at(0); - if (this->output_mem_config.is_sharded()) { - auto output_shape = this->compute_output_shapes(input_tensors).at(0); + auto output_mem_config = this->output_mem_config; + if (output_mem_config.is_sharded()) { auto shard_spec = input_tensor.shard_spec().value(); // TODO: This will segfault if input is not sharded... - // TODO: For reduction along H, the shard height is always 1 padded up to 32 (tile height) - // Need to clean this up to have new layout account for sharding with padding - shard_spec.shape[0] = tt_metal::compute_volume(output_shape) / output_shape[-1]; - auto mem_config = this->output_mem_config; - mem_config.shard_spec = shard_spec; - return { - create_device_tensor(output_shape, this->output_dtype, Layout::TILE, input_tensor.device(), mem_config)}; - } else { - return operation::generic_create_output_tensors( - *this, input_tensors, this->output_dtype, Layout::TILE, this->output_mem_config); + shard_spec.shape[0] = output_padded_shape.volume() / output_padded_shape[-1]; + output_mem_config.shard_spec = shard_spec; } + + return {ttnn::TensorSpec(output_shape, TensorLayout::fromLegacyPaddedShape(this->output_dtype, PageConfig(Layout::TILE), output_mem_config, ttnn::Shape(output_shape.view(), output_padded_shape.view())))}; } operation::ProgramWithCallbacks Reduce::create_program( diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.hpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.hpp index 2f7dfbdabd6..c53598910a1 100644 --- a/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.hpp @@ -30,8 +30,7 @@ struct Reduce { ttnn::DeviceComputeKernelConfig compute_kernel_config; void validate(const std::vector &input_tensors) const; - std::vector compute_output_shapes(const std::vector &input_tensors) const; - std::vector create_output_tensors(const std::vector &input_tensors) const; + std::vector compute_output_specs(const std::vector &input_tensors) const; operation::ProgramWithCallbacks create_program(const std::vector& input_tensors, std::vector &output_tensors) const; ReduceOpParallelizationStrategy get_parallelization_strategy(const std::vector& input_tensors) const; }; diff 
--git a/ttnn/cpp/ttnn/run_operation.cpp b/ttnn/cpp/ttnn/run_operation.cpp index 40e0f3a44ab..3d03485a101 100644 --- a/ttnn/cpp/ttnn/run_operation.cpp +++ b/ttnn/cpp/ttnn/run_operation.cpp @@ -298,18 +298,31 @@ template OptionalTensors run_without_autoformat( uint8_t cq_id); std::vector extract_legacy_shapes( - const std::variant, std::vector>&& shapes, const std::function& layout_provider) { + const std::variant, std::vector, std::vector>&& shapes, const std::function& layout_provider, const bool use_tensor_layout_from_tensor_spec) { if (std::holds_alternative>(shapes)) { return std::get>(std::move(shapes)); + } else if (std::holds_alternative>(shapes)) { + const auto& simple_shapes = std::get>(shapes); + std::vector legacy_shapes; + legacy_shapes.reserve(simple_shapes.size()); + for (size_t idx = 0; idx < simple_shapes.size(); idx++) { + TensorLayout tensor_layout = layout_provider(idx); + legacy_shapes.emplace_back(simple_shapes[idx].view(), tensor_layout.compute_padded_shape(simple_shapes[idx]).view()); + } + return legacy_shapes; + } else if (std::holds_alternative>(shapes)) { + const auto& tensor_specs = std::get>(shapes); + std::vector legacy_shapes; + legacy_shapes.reserve(tensor_specs.size()); + for (size_t idx = 0; idx < tensor_specs.size(); idx++) { + const auto& [simple_shape, output_layout] = tensor_specs[idx]; + TensorLayout tensor_layout = use_tensor_layout_from_tensor_spec ? output_layout : layout_provider(idx); + legacy_shapes.emplace_back(simple_shape.view(), tensor_layout.compute_padded_shape(simple_shape).view()); + } + return legacy_shapes; + } else { + TT_THROW("extract_legacy_shapes only supports LegacyShape, SimpleShape, or TensorSpec"); } - const auto& simple_shapes = std::get>(shapes); - std::vector legacy_shapes; - legacy_shapes.reserve(simple_shapes.size()); - for (size_t idx = 0; idx < simple_shapes.size(); idx++) { - TensorLayout tensor_layout = layout_provider(idx); - legacy_shapes.emplace_back(simple_shapes[idx].view(), tensor_layout.compute_padded_shape(simple_shapes[idx]).view()); - } - return legacy_shapes; } // To be deprecated/removed in favor of new implementation where ops specifically request how to format inputs/outputss @@ -361,7 +374,7 @@ Tensors run_with_autoformat( auto output_shapes = extract_legacy_shapes(operation.compute_output_shapes(input_tensors), [&](size_t idx) { auto tensor = output_tensors[idx]; return TensorLayout(tensor.get_dtype(), Layout::TILE, tensor.memory_config()); - }); + }, /*use_tensor_layout_from_tensor_spec=*/ true); TT_ASSERT(output_tensors.size() == output_shapes.size()); @@ -424,7 +437,7 @@ Tensors run_with_autoformat( auto output_shapes = extract_legacy_shapes(operation.compute_output_shapes(input_tensors), [&](size_t idx) { auto tensor = output_tensors[idx]; return TensorLayout(tensor.get_dtype(), output_layouts[idx], tensor.memory_config()); - }); + }, /*use_tensor_layout_from_tensor_spec=*/ false); TT_ASSERT(output_tensors.size() == output_shapes.size()); TT_ASSERT(output_tensors.size() == output_layouts.size()); diff --git a/ttnn/cpp/ttnn/run_operation.hpp b/ttnn/cpp/ttnn/run_operation.hpp index 3286146dfd8..1305a7e4da5 100644 --- a/ttnn/cpp/ttnn/run_operation.hpp +++ b/ttnn/cpp/ttnn/run_operation.hpp @@ -17,6 +17,11 @@ namespace tt::tt_metal { namespace operation { using ttnn::operations::experimental::auto_format::FormatParams; + +// TODO: create_output_tensors should become a fully manual path with no dependency on infra +// - Pass output shapes directly +// - Move default values for output_dtype and 
output_mem_config inside ops
+// - This function becomes just a regular helper function
 template <typename ConcreteOperation>
 auto generic_create_output_tensors(
     const ConcreteOperation& operation,
diff --git a/ttnn/cpp/ttnn/tensor/tensor.hpp b/ttnn/cpp/ttnn/tensor/tensor.hpp
index 2aa903fad62..8291cba455d 100644
--- a/ttnn/cpp/ttnn/tensor/tensor.hpp
+++ b/ttnn/cpp/ttnn/tensor/tensor.hpp
@@ -368,5 +368,6 @@ bool validate_worker_modes(const std::vector<Device*> &workers);
 namespace ttnn {
 using Tensor = tt::tt_metal::Tensor;
+using TensorSpec = std::pair<ttnn::SimpleShape, tt::tt_metal::TensorLayout>;
 } // namespace ttnn
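
For context, a minimal Python usage sketch of the two experimental ops registered above, ttnn.experimental.nlp_create_qkv_heads_segformer and ttnn.experimental.nlp_create_qkv_heads_vit. This is an assumption-laden sketch, not part of the change: it assumes a Wormhole device is available, that the sequence length is a multiple of the tile height (32), and the shapes follow the docstrings added in the pybind files ([B, 1, S, 2304] in, three [B, 12, S, 64] out).

import torch
import ttnn

# Hypothetical setup; device id and shapes are illustrative only.
device = ttnn.open_device(device_id=0)

# Fused QKV activations: [B, 1, S, 2304], where 2304 = 3 * 12 heads * 64 head_dim.
fused_qkv = torch.randn(1, 1, 224, 2304, dtype=torch.bfloat16)
fused_qkv_tt = ttnn.from_torch(fused_qkv, layout=ttnn.TILE_LAYOUT, device=device)

# Each output is [B, 12, S, 64]; the segformer variant is called the same way.
q, k, v = ttnn.experimental.nlp_create_qkv_heads_vit(fused_qkv_tt)

ttnn.close_device(device)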
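The reader kernel above walks each tile-row of the fused input as 24 Q tiles, then 24 K tiles, then 24 V tiles, and the writer lays each group of two tiles into consecutive head rows, which implies the last input dimension is ordered [Q | K | V] with 12 contiguous heads of width 64 each. A host-side reference of that shuffle, written under that assumption (a sketch of the intended semantics, not the device implementation):

import torch

def reference_create_qkv_heads(fused_qkv: torch.Tensor):
    """fused_qkv: [B, 1, S, 2304] -> three tensors of shape [B, 12, S, 64] (assumed layout)."""
    B, _, S, _ = fused_qkv.shape
    # Split the width into the Q, K, V regions of 12 * 64 = 768 columns each.
    q, k, v = fused_qkv.squeeze(1).split(12 * 64, dim=-1)   # each [B, S, 768]

    def split_heads(t: torch.Tensor) -> torch.Tensor:
        # Heads are assumed contiguous along the width: head h owns columns [64*h, 64*(h+1)).
        return t.view(B, S, 12, 64).permute(0, 2, 1, 3)     # [B, 12, S, 64]

    return split_heads(q), split_heads(k), split_heads(v)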
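The Reduce::compute_output_specs change keeps a logical size of 1 on each reduced dimension while the padded size stays at a full tile (32), and the TensorSpec pair carries both through TensorLayout::fromLegacyPaddedShape. A small Python sketch of that shape arithmetic, mirroring the switch in the diff (illustrative only):

TILE_HEIGHT = TILE_WIDTH = 32

def reduce_output_shapes(logical_shape, padded_shape, dim):
    """dim is one of 'H', 'W', 'HW'; returns (logical, padded) output shapes."""
    logical, padded = list(logical_shape), list(padded_shape)
    if dim in ("H", "HW"):
        logical[2], padded[2] = 1, TILE_HEIGHT
    if dim in ("W", "HW"):
        logical[3], padded[3] = 1, TILE_WIDTH
    return logical, padded

# e.g. reducing [1, 1, 224, 2304] along W -> logical [1, 1, 224, 1], padded [1, 1, 224, 32]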