diff --git a/.github/workflows/pipeline-select-t3k.yaml b/.github/workflows/pipeline-select-t3k.yaml index 3df726bdd6e..d602c81a73e 100644 --- a/.github/workflows/pipeline-select-t3k.yaml +++ b/.github/workflows/pipeline-select-t3k.yaml @@ -12,6 +12,10 @@ on: - RelWithDebInfo - CI default: "Release" + extra-tag: + required: true + type: string + default: "in-service" build-with-tracy: required: false type: boolean @@ -54,29 +58,41 @@ jobs: needs: build-artifact secrets: inherit uses: ./.github/workflows/t3000-unit-tests-impl.yaml + with: + extra-tag: ${{ inputs.extra-tag }} if: ${{ inputs.t3000-unit }} t3000-demo-tests: needs: build-artifact secrets: inherit uses: ./.github/workflows/t3000-demo-tests-impl.yaml + with: + extra-tag: ${{ inputs.extra-tag }} if: ${{ inputs.t3000-demo }} t3000-frequent-tests: needs: build-artifact secrets: inherit uses: ./.github/workflows/t3000-frequent-tests-impl.yaml + with: + extra-tag: ${{ inputs.extra-tag }} if: ${{ inputs.t3000-frequent }} t3000-nightly-tests: needs: build-artifact secrets: inherit uses: ./.github/workflows/t3000-nightly-tests-impl.yaml + with: + extra-tag: ${{ inputs.extra-tag }} if: ${{ inputs.t3000-nightly }} t3000-model-perf-tests: needs: build-artifact secrets: inherit uses: ./.github/workflows/t3000-model-perf-tests-impl.yaml + with: + extra-tag: ${{ inputs.extra-tag }} if: ${{ inputs.t3000-model-perf }} t3000-profiler-tests: needs: build-artifact secrets: inherit uses: ./.github/workflows/t3000-profiler-tests-impl.yaml + with: + extra-tag: ${{ inputs.extra-tag }} if: ${{ inputs.t3000-profiler }} diff --git a/.github/workflows/t3000-demo-tests-impl.yaml b/.github/workflows/t3000-demo-tests-impl.yaml index 8c3d98815a3..48f0467d2cb 100644 --- a/.github/workflows/t3000-demo-tests-impl.yaml +++ b/.github/workflows/t3000-demo-tests-impl.yaml @@ -2,6 +2,11 @@ name: "[internal] T3000 demo tests impl" on: workflow_call: + inputs: + extra-tag: + required: false + type: string + default: "in-service" jobs: t3000-demo-tests: @@ -23,7 +28,11 @@ jobs: ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-perf"] + runs-on: + - arch-wormhole_b0 + - config-t3000 + - pipeline-perf + - ${{ inputs.extra-tag }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - name: Enable performance mode diff --git a/.github/workflows/t3000-frequent-tests-impl.yaml b/.github/workflows/t3000-frequent-tests-impl.yaml index f770b1db457..67036141b0e 100644 --- a/.github/workflows/t3000-frequent-tests-impl.yaml +++ b/.github/workflows/t3000-frequent-tests-impl.yaml @@ -2,6 +2,11 @@ name: "[internal] T3000 frequent tests impl" on: workflow_call: + inputs: + extra-tag: + required: false + type: string + default: "in-service" jobs: t3000-frequent-tests: @@ -27,7 +32,11 @@ jobs: ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-functional"] + runs-on: + - arch-wormhole_b0 + - config-t3000 + - pipeline-functional + - ${{ inputs.extra-tag }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - uses: ./.github/actions/ensure-active-weka-mount diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml index 1787bd5c2e9..91e208c214b 100644 --- 
a/.github/workflows/t3000-model-perf-tests-impl.yaml +++ b/.github/workflows/t3000-model-perf-tests-impl.yaml @@ -2,6 +2,11 @@ name: "[internal] T3000 model perf tests impl" on: workflow_call: + inputs: + extra-tag: + required: false + type: string + default: "in-service" jobs: @@ -25,7 +30,11 @@ jobs: ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-perf"] + runs-on: + - arch-wormhole_b0 + - config-t3000 + - pipeline-perf + - ${{ inputs.extra-tag }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - name: Enable performance mode diff --git a/.github/workflows/t3000-nightly-tests-impl.yaml b/.github/workflows/t3000-nightly-tests-impl.yaml index 009e6549d19..2255a5b4f6d 100644 --- a/.github/workflows/t3000-nightly-tests-impl.yaml +++ b/.github/workflows/t3000-nightly-tests-impl.yaml @@ -2,6 +2,11 @@ name: "[internal] T3000 nightly tests impl" on: workflow_call: + inputs: + extra-tag: + required: false + type: string + default: "in-service" jobs: t3000-nightly-tests: @@ -18,7 +23,11 @@ jobs: ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-functional"] + runs-on: + - arch-wormhole_b0 + - config-t3000 + - pipeline-functional + - ${{ inputs.extra-tag }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - name: Set up dynamic env vars for build diff --git a/.github/workflows/t3000-profiler-tests-impl.yaml b/.github/workflows/t3000-profiler-tests-impl.yaml index ebd4940bb68..a703ed83b1d 100644 --- a/.github/workflows/t3000-profiler-tests-impl.yaml +++ b/.github/workflows/t3000-profiler-tests-impl.yaml @@ -2,6 +2,11 @@ name: "[internal] T3000 profiler tests impl" on: workflow_call: + inputs: + extra-tag: + required: false + type: string + default: "in-service" jobs: t3000-profiler-tests: @@ -12,7 +17,7 @@ jobs: { name: "T3000 profiler tests", arch: wormhole_b0, - runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-perf"], + runs-on: ["arch-wormhole_b0", "config-t3000", "pipeline-perf", "${{ inputs.extra-tag }}"], cmd: './tests/scripts/run_profiler_regressions.sh' }, ] diff --git a/.github/workflows/t3000-unit-tests-impl.yaml b/.github/workflows/t3000-unit-tests-impl.yaml index 0fce1a2b5e5..66e32904a0c 100644 --- a/.github/workflows/t3000-unit-tests-impl.yaml +++ b/.github/workflows/t3000-unit-tests-impl.yaml @@ -2,6 +2,11 @@ name: "[internal] T3000 unit tests impl" on: workflow_call: + inputs: + extra-tag: + required: false + type: string + default: "in-service" jobs: t3000-unit-tests: @@ -28,7 +33,11 @@ jobs: ARCH_NAME: ${{ matrix.test-group.arch }} LOGURU_LEVEL: INFO LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib - runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-functional"] + runs-on: + - arch-wormhole_b0 + - config-t3000 + - pipeline-functional + - ${{ inputs.extra-tag }} steps: - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0 - name: Set up dynamic env vars for build diff --git a/CMakeLists.txt b/CMakeLists.txt index b03b9363675..bbac64a095b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -263,7 +263,6 @@ if("$ENV{ARCH_NAME}" STREQUAL "wormhole_b0") tt_metal/hw/inc/wormhole ${PROJECT_SOURCE_DIR}/tt_metal/hw/inc/wormhole/wormhole_b0_defines 
${UMD_HOME}/device/wormhole - ${UMD_HOME}/src/firmware/riscv/wormhole ) else() target_include_directories( @@ -271,7 +270,6 @@ else() INTERFACE ${PROJECT_SOURCE_DIR}/tt_metal/hw/inc/$ENV{ARCH_NAME} ${UMD_HOME}/device/$ENV{ARCH_NAME} - ${UMD_HOME}/src/firmware/riscv/$ENV{ARCH_NAME} ) endif() diff --git a/models/demos/llama3/lt b/models/demos/llama3/lt index c69b4113a84..70beb4e6e98 100755 --- a/models/demos/llama3/lt +++ b/models/demos/llama3/lt @@ -14,6 +14,7 @@ import re import time import signal import psutil +import json def ensure_less_installed(): @@ -52,6 +53,204 @@ def ensure_ttsmi_installed(): sys.exit(1) +class OutputEntryList: + def __init__(self): + self._entries = [] + # Create logs directory + os.makedirs("logs", exist_ok=True) + # Load existing state + self._load_state() + + def _load_state(self): + try: + with open("logs/state.json", "r") as f: + state = json.load(f) + for entry_data in state: + entry = Entry( + entry_data["command_name"], + entry_data["model"], + entry_data["device"], + entry_data["command_input"], + ) + # Restore saved attributes + entry.status = ( + "Cancelled" + if entry_data["status"] + in [ + "Running", + "Resetting", + "Initializing device", + "Starting", + "Prefill", + "Decode", + "Terminating", + "Exiting", + ] + else entry_data["status"] + ) + entry.output = entry_data["output"] + entry.log_id = entry_data["log_id"] + entry.speed = entry_data["speed"] + entry.pcc = entry_data["pcc"] + self._entries.append(entry) + except (FileNotFoundError, json.JSONDecodeError): + pass + + def save_state(self): + state = [] + for entry in self._entries: + entry_data = { + "command_name": entry.command_name, + "model": entry.model, + "device": entry.device, + "command_input": entry.command_input, + "status": entry.status, + "output": entry.output, + "log_id": entry.log_id, + } + if hasattr(entry, "speed"): + entry_data["speed"] = entry.speed + if hasattr(entry, "pcc"): + entry_data["pcc"] = entry.pcc + state.append(entry_data) + + with open("logs/state.json", "w") as f: + json.dump(state, f, indent=2) + + def __len__(self): + return len(self._entries) + + def __getitem__(self, index): + return self._entries[index] + + def __iter__(self): + return iter(self._entries) + + def append(self, entry): + if entry.log_id is None: + entry.log_id = self.next_log_id() + + # Remove any existing logs with same ID + for existing_file in os.listdir("logs"): + if existing_file.startswith(entry.log_prefix): + os.remove(os.path.join("logs", existing_file)) + + # Set parent list reference before appending + entry.set_parent_list(self) + self._entries.append(entry) + self.save_state() + + def pop(self, index): + result = self._entries.pop(index) + + try: + os.remove(result.get_log_filename()) + except (OSError, FileNotFoundError): + pass + + # Mark all subsequent entries as changed + for entry in self._entries[index:]: + entry.mark_changed() + + self.save_state() + return result + + def index(self, entry): + return self._entries.index(entry) + + def get_entries(self): + return self._entries + + def next_log_id(self): + # Fix potential issue with list comprehension on dictionary access + max_id = 0 + for entry in self._entries: + if entry.log_id > max_id: + max_id = entry.log_id + return max_id + 1 + + +class Entry: + def __init__(self, command_name, model, device, command_input): + self.command_name = command_name + self.model = model + self.device = device.upper() + self.command_input = command_input + self.status = "Waiting" + self.output = "" + self.process = None + self.log_file = 
None + self.stop_event = threading.Event() + self.lock = threading.Lock() + self.log_id = None # Will be set by OutputEntryList + self.speed = None + self.pcc = None + self.thread = None + self.changed = True # Initialize as changed to ensure first draw + self._parent_list = None # Reference to parent OutputEntryList + + @property + def log_prefix(self): + """Generate the log file prefix based on the entry's log ID""" + return f"{self.log_id:04d}-" + + def mark_changed(self): + self.changed = True + + def mark_drawn(self): + self.changed = False + + def __setattr__(self, name, value): + super().__setattr__(name, value) + # Mark as changed whenever any attribute is modified + # (except for 'changed' itself to avoid recursion) + if name != "changed" and hasattr(self, "changed"): + self.changed = True + # Save state if we have a parent list and this isn't an unpersisted attribute + if ( + hasattr(self, "_parent_list") + and self._parent_list + and name not in ["process", "log_file", "stop_event", "lock", "thread"] + ): + self._parent_list.save_state() + + def __getitem__(self, key): + # Support dictionary-style access for backward compatibility + return getattr(self, key) + + def __setitem__(self, key, value): + # Support dictionary-style assignment for backward compatibility + setattr(self, key, value) + + def get(self, key, default=None): + # Support dictionary-style get() for backward compatibility + return getattr(self, key, default) + + def get_log_filename(self): + """Generate log filename based on entry properties""" + command_name = self._get_command_name() + filename = f"{self.log_prefix}{self.device}-{self.model}-{command_name}.log" + return os.path.join("logs", filename.replace("/", "_")) + + def _get_command_name(self): + """Extract command name from command input""" + if "pytest" in self.command_input: + match = re.search(r"pytest\s+([\S]+)", self.command_input) + if match: + test_file = match.group(1) + return os.path.basename(test_file).split(".")[0] + return "pytest" + return os.path.basename(shlex.split(self.command_input)[0]) + + def open_log_file(self): + """Open and return log file for writing""" + self.log_file = open(self.get_log_filename(), "w") + return self.log_file + + def set_parent_list(self, parent_list): + self._parent_list = parent_list + + def main(stdscr): curses.curs_set(0) # Hide cursor curses.start_color() @@ -69,9 +268,8 @@ def main(stdscr): {"label": "Device (n150, n300, t3k) [all]", "value": "", "x": 0, "y": 2}, ] - output_entries = [] + output_entries = OutputEntryList() current_line = 0 # Index of the current line (input fields + output entries) - total_lines = len(input_fields) screen_lock = threading.Lock() screen_needs_update = threading.Event() # New event to signal screen updates @@ -201,26 +399,14 @@ def main(stdscr): # Create output entries for command, model, device in combinations: command_name = get_command_name(command) - entry = { - "command_name": command_name, - "model": model, - "device": device.upper(), - "status": "Waiting", - "output": "", - "process": None, - "log_file": None, - "index": len(output_entries), - "stop_event": threading.Event(), - "lock": threading.Lock(), - "command_input": command, # Save the individual command - } + entry = Entry(command_name, model, device, command) output_entries.append(entry) - # Update total_lines - total_lines = len(input_fields) + len(output_entries) + current_line = 0 screen_needs_update.set() else: # Otherwise if not the last field, move to next field + total_lines = len(input_fields) + 
len(output_entries) current_line = (current_line + 1) % total_lines screen_needs_update.set() else: @@ -228,13 +414,13 @@ def main(stdscr): entry_index = current_line - len(input_fields) if entry_index < len(output_entries): entry = output_entries[entry_index] - if entry["log_file"]: + if os.path.exists(entry.get_log_filename()): # Save current terminal state curses.def_prog_mode() # Exit curses temporarily curses.endwin() # Run less command - os.system(f"less -R {entry['log_file'].name}") + os.system(f"less -R {entry.get_log_filename()}") # Resume curses curses.reset_prog_mode() stdscr.refresh() @@ -253,20 +439,26 @@ def main(stdscr): entry_index = current_line - len(input_fields) if entry_index < len(output_entries): entry = output_entries[entry_index] - with entry["lock"]: - if entry["process"] and entry["process"].poll() is None: - # Cancel the running process - entry["stop_event"].set() - terminate_process_tree(entry["process"].pid) - entry["status"] = "Terminating" - elif entry["status"] != "Resetting": - # Remove the entry if it's already cancelled - output_entries.pop(entry_index) - total_lines -= 1 - if current_line >= total_lines: - current_line = total_lines - 1 + if cancel_entry(entry): + output_entries.pop(entry_index) + total_lines = len(input_fields) + len(output_entries) + if current_line >= total_lines: + current_line = total_lines - 1 screen_needs_update.set() + elif c == ord("X") and current_line >= len(input_fields): # Shift-X to clear all entries + to_remove = [] + for entry in output_entries: + if cancel_entry(entry): + to_remove.append(entry) + for entry in to_remove: + entry_index = output_entries.index(entry) + output_entries.pop(entry_index) + screen_needs_update.set() + total_lines = len(input_fields) + len(output_entries) + if current_line >= total_lines: + current_line = total_lines - 1 elif c == 9: # Tab key + total_lines = len(input_fields) + len(output_entries) current_line = (current_line + 1) % total_lines screen_needs_update.set() elif c == ord("r") and current_line >= len(input_fields): @@ -278,10 +470,8 @@ def main(stdscr): # Reset the entry to "Waiting" status entry["status"] = "Waiting" entry["output"] = "" - if "speed" in entry: - del entry["speed"] - if "pcc" in entry: - del entry["pcc"] + entry["speed"] = None + entry["pcc"] = None entry["process"] = None entry["log_file"] = None entry["stop_event"].clear() @@ -385,28 +575,24 @@ def draw_changes(stdscr, input_fields, output_entries, current_line, last_drawn_ output_start_y = header_y + 1 for idx, entry in enumerate(output_entries): y = output_start_y + idx - if y >= max_y - 3: # Change this line from max_y - 2 to max_y - 3 + if y >= max_y - 3: break - if ( - idx >= len(last_drawn_state["output_entries"]) - or entry != last_drawn_state["output_entries"][idx] - or current_line != last_drawn_state["current_line"] - ): + + # Only draw if entry has changed or selection state changed + if entry.changed or current_line != last_drawn_state["current_line"]: draw_output_entry(stdscr, entry, y, current_line == len(input_fields) + idx, max_x) + entry.mark_drawn() # Mark as drawn after updating # Clear any extra lines if output entries were removed for y in range( output_start_y + len(output_entries), - min( - output_start_y + len(last_drawn_state["output_entries"]), max_y - 3 - ), # Change this line from max_y - 2 to max_y - 3 + min(output_start_y + len(last_drawn_state["output_entries"]), max_y - 3), ): stdscr.move(y, 0) stdscr.clrtoeol() - # Update last_drawn_state - last_drawn_state["output_entries"] = 
[entry.copy() for entry in output_entries] last_drawn_state["current_line"] = current_line + last_drawn_state["output_entries"] = [{"log_id": entry.log_id} for entry in output_entries] def draw_input_field(stdscr, field, is_selected, max_x): @@ -423,13 +609,13 @@ def draw_input_field(stdscr, field, is_selected, max_x): def draw_output_entry(stdscr, entry, y, is_selected, max_x): cols = [ - entry["command_name"], - entry["model"], - entry["device"], - entry["status"], - entry.get("speed", ""), - entry.get("pcc", ""), - entry["output"], + entry.command_name, + entry.model, + entry.device, + entry.status, + entry.speed if entry.speed else "", + entry.pcc if entry.pcc else "", + entry.output, ] col_widths = [20, 10, 10, 20, 10, 10, max_x - 85] # Adjusted widths to accommodate the PCC column @@ -441,7 +627,7 @@ def draw_output_entry(stdscr, entry, y, is_selected, max_x): else: color = curses.color_pair(0) if i == 3: # Status column - status = entry["status"] + status = entry.status if status == "Waiting" or status == "Cancelled": color = COLOR_PAIR_WAITING elif status in ["Running", "Initializing device", "Prefill", "Decode", "Starting"] or status.startswith( @@ -527,10 +713,13 @@ def run_entry_command(entry, screen_lock, output_entries, screen_needs_update): env["FAKE_DEVICE"] = entry["device"] env["LLAMA_DIR"] = get_llama_dir(entry["model"]) + # Open log file + entry.open_log_file() + # Define command shortcuts command_shortcuts = { "demo": "pytest models/demos/llama3/demo/demo.py -k instruct_weights-1", - "demo_1layer": "pytest models/demos/llama3/demo/demo.py -k single_layer", + "demo-1layer": "pytest models/demos/llama3/demo/demo.py -k single_layer", "attention": "pytest models/demos/llama3/tests/test_llama_attention.py", "attention-prefill": "pytest models/demos/llama3/tests/test_llama_attention_prefill.py", "mlp": "pytest models/demos/llama3/tests/test_llama_mlp.py", @@ -575,11 +764,6 @@ def run_entry_command(entry, screen_lock, output_entries, screen_needs_update): # Prepare the command cmd_list = shlex.split(command_input) - # Open log file - log_filename = get_log_filename(entry["device"], entry["model"], command_input) - os.makedirs("logs", exist_ok=True) - entry["log_file"] = open(os.path.join("logs", log_filename), "w") - # If the command is invalid, write the output to the log file and return before trying to run the bad command if entry["status"] == "Error": entry["log_file"].write(entry["output"] + "\n") @@ -600,31 +784,28 @@ def run_entry_command(entry, screen_lock, output_entries, screen_needs_update): def process_output(entry, screen_lock, output_entries, screen_needs_update): - process = entry["process"] - log_file = entry["log_file"] + process = entry.process + log_file = entry.log_file previous_line = "" try: for line in iter(process.stdout.readline, ""): - if entry["stop_event"].is_set(): - break # Write to log file log_file.write(line) log_file.flush() # Update status and output based on output - status, output, speed, pcc = parse_output_line(line, previous_line, entry["status"]) + status, output, speed, pcc = parse_output_line(line, previous_line, entry.status) previous_line = line.strip() - with entry["lock"]: - if status != entry["status"] or output or speed is not None or pcc is not None: - entry["status"] = status + with entry.lock: + if status != entry.status or output or speed is not None or pcc is not None: + entry.status = status # This will mark entry as changed via __setattr__ if output: - entry["output"] = output + entry.output = output if speed is not None: 
- entry["speed"] = f"{speed:.1f}" + entry.speed = f"{speed:.1f}" if pcc is not None: - current_pcc = entry.get("pcc") - if current_pcc is None or float(pcc) < float(current_pcc): - entry["pcc"] = pcc + if entry.pcc is None or float(pcc) < float(entry.pcc): + entry.pcc = pcc screen_needs_update.set() with screen_lock: @@ -637,17 +818,19 @@ def process_output(entry, screen_lock, output_entries, screen_needs_update): # Wait for the process to fully terminate process.wait() - with entry["lock"]: + with entry.lock: if process.returncode != 0: - exception_name = find_exception_in_log(entry["log_file"].name) - entry["status"] = "Error" - if exception_name: - entry["output"] = exception_name + if entry.stop_event.is_set(): + entry.status = "Cancelled" + else: + exception_name = find_exception_in_log(entry.log_file.name) + entry.status = "Error" + if exception_name: + entry.output = exception_name reset_device_async(entry, screen_lock, screen_needs_update) - screen_needs_update.set() else: - entry["status"] = "Finished" - entry["process"] = None + entry.status = "Finished" + entry.process = None log_file.close() screen_needs_update.set() # Ensure screen is updated after process termination @@ -746,13 +929,6 @@ def get_command_name(command_input): return command_name -def get_log_filename(device, model, command_input): - command_name = get_command_name(command_input) - filename = f"{device}-{model}-{command_name}.log" - filename = filename.replace("/", "_") - return filename - - def find_exception_in_log(log_filename): exception_name = None with open(log_filename, "r") as f: @@ -807,12 +983,12 @@ def reset_device_async(entry, screen_lock, screen_needs_update): except subprocess.CalledProcessError as e: pass finally: - with entry["lock"]: - entry["status"] = previous_status + with entry.lock: + entry.status = previous_status screen_needs_update.set() - previous_status = entry["status"] - entry["status"] = "Resetting" + previous_status = entry.status + entry.status = "Resetting" reset_thread = threading.Thread(target=reset_thread) reset_thread.daemon = True reset_thread.start() @@ -827,13 +1003,28 @@ def draw_help_bar(stdscr, current_line, num_input_fields, num_output_entries): def get_help_text(current_line, num_input_fields, num_output_entries): if current_line == 0: - return "Shortcuts: demo, attention, mlp, decoder, decoder-prefill, model, model-prefill, model-quick | Enter: Submit | ↑↓: Navigate fields | Esc: Exit" + return "Shortcuts: demo, demo-1layer, attention, mlp, rmsnorm, decoder, model, model-quick, 'help' for full list | Enter: Submit | ↑↓: Navigate fields | Esc: Exit" elif current_line <= num_input_fields - 1: return "Enter: Next field | ↑↓: Navigate fields | Esc: Exit" else: - return ( - "Enter: View log | Backspace/x: Cancel/remove entry | r: Restart entry | ↑↓: Navigate entries | Esc: Exit" - ) + return "Enter: View log | Backspace/x: Cancel entry | X: Cancel all | r: Restart entry | ↑↓: Navigate entries | Esc: Exit" + + +def cancel_entry(entry): + """Handle removal of a single entry, returning True if entry was removed""" + with entry["lock"]: + if entry["process"] and entry["process"].poll() is None: + # Cancel the running process + entry["stop_event"].set() + terminate_process_tree(entry["process"].pid) + entry["status"] = "Terminating" + # Entry is still running, so don't remove it + return False + elif entry["status"] != "Resetting": + # Safe to remove the entry if it's already cancelled + return True + # Entry is running/resetting, so don't remove it + return False if __name__ 
== "__main__": diff --git a/models/demos/segformer/README.md b/models/demos/segformer/README.md new file mode 100644 index 00000000000..796a7fcdfd2 --- /dev/null +++ b/models/demos/segformer/README.md @@ -0,0 +1,20 @@ +# Segformer Demo + +## How to run demo + +- Use the following command to run the Segformer Encoder model (Classification): + ```python + pytest tests/ttnn/integration_tests/segformer/test_segformer_encoder.py + ``` + + +- Use the following command to run the Segformer Decoder module model: + ```python + pytest tests/ttnn/integration_tests/segformer/test_segformer_decode_head.py + ``` + + +- Use the following command to run the Segformer full model (Segmentation): + ```python + pytest tests/ttnn/integration_tests/segformer/test_segformer_for_semantic_segmentation.py + ``` diff --git a/models/experimental/functional_segformer/reference/segformer_attention.py b/models/demos/segformer/reference/segformer_attention.py similarity index 82% rename from models/experimental/functional_segformer/reference/segformer_attention.py rename to models/demos/segformer/reference/segformer_attention.py index c87af0938ab..b3ddfcdad3b 100644 --- a/models/experimental/functional_segformer/reference/segformer_attention.py +++ b/models/demos/segformer/reference/segformer_attention.py @@ -3,10 +3,10 @@ # SPDX-License-Identifier: Apache-2.0 import torch.nn as nn -from models.experimental.functional_segformer.reference.segformer_efficient_selfattention import ( +from models.demos.segformer.reference.segformer_efficient_selfattention import ( SegformerEfficientSelfAttention, ) -from models.experimental.functional_segformer.reference.segformer_selfoutput import SegformerSelfOutput +from models.demos.segformer.reference.segformer_selfoutput import SegformerSelfOutput class SegformerAttention(nn.Module): diff --git a/models/experimental/functional_segformer/reference/segformer_decode_head.py b/models/demos/segformer/reference/segformer_decode_head.py similarity index 97% rename from models/experimental/functional_segformer/reference/segformer_decode_head.py rename to models/demos/segformer/reference/segformer_decode_head.py index c0d4c15b35d..c8f97106fd1 100644 --- a/models/experimental/functional_segformer/reference/segformer_decode_head.py +++ b/models/demos/segformer/reference/segformer_decode_head.py @@ -5,7 +5,7 @@ import torch from torch import nn import math -from models.experimental.functional_segformer.reference.segformer_mlp import SegformerMLP +from models.demos.segformer.reference.segformer_mlp import SegformerMLP from transformers.models.segformer.modeling_segformer import SegformerPreTrainedModel @@ -50,6 +50,7 @@ def forward(self, encoder_hidden_states: torch.FloatTensor) -> torch.Tensor: encoder_hidden_state = mlp(encoder_hidden_state) encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1) encoder_hidden_state = encoder_hidden_state.reshape(batch_size, -1, height, width) + # upsample encoder_hidden_state = nn.functional.interpolate( encoder_hidden_state, size=encoder_hidden_states[0].size()[2:], mode="bilinear", align_corners=False diff --git a/models/experimental/functional_segformer/reference/segformer_dwconv.py b/models/demos/segformer/reference/segformer_dwconv.py similarity index 100% rename from models/experimental/functional_segformer/reference/segformer_dwconv.py rename to models/demos/segformer/reference/segformer_dwconv.py diff --git a/models/experimental/functional_segformer/reference/segformer_efficient_selfattention.py 
b/models/demos/segformer/reference/segformer_efficient_selfattention.py similarity index 100% rename from models/experimental/functional_segformer/reference/segformer_efficient_selfattention.py rename to models/demos/segformer/reference/segformer_efficient_selfattention.py diff --git a/models/experimental/functional_segformer/reference/segformer_encoder.py b/models/demos/segformer/reference/segformer_encoder.py similarity index 95% rename from models/experimental/functional_segformer/reference/segformer_encoder.py rename to models/demos/segformer/reference/segformer_encoder.py index dc07cdebf7a..659b5e14f79 100644 --- a/models/experimental/functional_segformer/reference/segformer_encoder.py +++ b/models/demos/segformer/reference/segformer_encoder.py @@ -5,10 +5,10 @@ import torch import torch.nn as nn from transformers.modeling_outputs import BaseModelOutput -from models.experimental.functional_segformer.reference.segformer_overlap_patch_embeddings import ( +from models.demos.segformer.reference.segformer_overlap_patch_embeddings import ( SegformerOverlapPatchEmbeddings, ) -from models.experimental.functional_segformer.reference.segformer_layer import SegformerLayer +from models.demos.segformer.reference.segformer_layer import SegformerLayer from typing import Optional, Tuple, Union diff --git a/models/experimental/functional_segformer/reference/segformer_for_semantic_segmentation.py b/models/demos/segformer/reference/segformer_for_semantic_segmentation.py similarity index 95% rename from models/experimental/functional_segformer/reference/segformer_for_semantic_segmentation.py rename to models/demos/segformer/reference/segformer_for_semantic_segmentation.py index 59b90ffebb4..a311c413840 100644 --- a/models/experimental/functional_segformer/reference/segformer_for_semantic_segmentation.py +++ b/models/demos/segformer/reference/segformer_for_semantic_segmentation.py @@ -8,8 +8,8 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss from transformers.modeling_outputs import SemanticSegmenterOutput from transformers.models.segformer.modeling_segformer import SegformerPreTrainedModel -from models.experimental.functional_segformer.reference.segformer_model import SegformerModelReference -from models.experimental.functional_segformer.reference.segformer_decode_head import SegformerDecodeHead +from models.demos.segformer.reference.segformer_model import SegformerModelReference +from models.demos.segformer.reference.segformer_decode_head import SegformerDecodeHead class SegformerForSemanticSegmentationReference(SegformerPreTrainedModel): diff --git a/models/experimental/functional_segformer/reference/segformer_layer.py b/models/demos/segformer/reference/segformer_layer.py similarity index 89% rename from models/experimental/functional_segformer/reference/segformer_layer.py rename to models/demos/segformer/reference/segformer_layer.py index 814815816eb..62ce7212da7 100644 --- a/models/experimental/functional_segformer/reference/segformer_layer.py +++ b/models/demos/segformer/reference/segformer_layer.py @@ -3,8 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 import torch.nn as nn -from models.experimental.functional_segformer.reference.segformer_attention import SegformerAttention -from models.experimental.functional_segformer.reference.segformer_mixffn import SegformerMixFFN +from models.demos.segformer.reference.segformer_attention import SegformerAttention +from models.demos.segformer.reference.segformer_mixffn import SegformerMixFFN class SegformerLayer(nn.Module): diff --git 
a/models/experimental/functional_segformer/reference/segformer_mixffn.py b/models/demos/segformer/reference/segformer_mixffn.py similarity index 91% rename from models/experimental/functional_segformer/reference/segformer_mixffn.py rename to models/demos/segformer/reference/segformer_mixffn.py index 21d2627b206..d8760292a80 100644 --- a/models/experimental/functional_segformer/reference/segformer_mixffn.py +++ b/models/demos/segformer/reference/segformer_mixffn.py @@ -4,7 +4,7 @@ import torch.nn as nn from transformers.activations import ACT2FN -from models.experimental.functional_segformer.reference.segformer_dwconv import SegformerDWConv +from models.demos.segformer.reference.segformer_dwconv import SegformerDWConv class SegformerMixFFN(nn.Module): diff --git a/models/experimental/functional_segformer/reference/segformer_mlp.py b/models/demos/segformer/reference/segformer_mlp.py similarity index 100% rename from models/experimental/functional_segformer/reference/segformer_mlp.py rename to models/demos/segformer/reference/segformer_mlp.py diff --git a/models/experimental/functional_segformer/reference/segformer_model.py b/models/demos/segformer/reference/segformer_model.py similarity index 95% rename from models/experimental/functional_segformer/reference/segformer_model.py rename to models/demos/segformer/reference/segformer_model.py index e24d246d457..fa7e0dc5a7c 100644 --- a/models/experimental/functional_segformer/reference/segformer_model.py +++ b/models/demos/segformer/reference/segformer_model.py @@ -7,7 +7,7 @@ from typing import Union, Tuple, Optional from transformers.modeling_outputs import BaseModelOutput from transformers.models.segformer.modeling_segformer import SegformerPreTrainedModel -from models.experimental.functional_segformer.reference.segformer_encoder import SegformerEncoder +from models.demos.segformer.reference.segformer_encoder import SegformerEncoder class SegformerModelReference(SegformerPreTrainedModel): diff --git a/models/experimental/functional_segformer/reference/segformer_overlap_patch_embeddings.py b/models/demos/segformer/reference/segformer_overlap_patch_embeddings.py similarity index 100% rename from models/experimental/functional_segformer/reference/segformer_overlap_patch_embeddings.py rename to models/demos/segformer/reference/segformer_overlap_patch_embeddings.py diff --git a/models/experimental/functional_segformer/reference/segformer_selfoutput.py b/models/demos/segformer/reference/segformer_selfoutput.py similarity index 100% rename from models/experimental/functional_segformer/reference/segformer_selfoutput.py rename to models/demos/segformer/reference/segformer_selfoutput.py diff --git a/models/experimental/functional_segformer/tt/common.py b/models/demos/segformer/tt/common.py similarity index 84% rename from models/experimental/functional_segformer/tt/common.py rename to models/demos/segformer/tt/common.py index 85de8856df6..5f52fe0e507 100644 --- a/models/experimental/functional_segformer/tt/common.py +++ b/models/demos/segformer/tt/common.py @@ -18,7 +18,7 @@ def __init__( height_sharding=True, activation="", groups=1, - dtype=ttnn.bfloat16, + dtype=ttnn.bfloat8_b, ) -> None: self.weights = parameters["weight"] self.bias = parameters["bias"] @@ -73,15 +73,5 @@ def __call__(self, device, input_tensor): conv_config=conv_config, groups=self.groups, ) - ## TODO: Op | WARNING | Tensor at index 0 is not allocated - # print("sr2a", output_tensor.shape) - - # output_tensor = ttnn.from_device(output_tensor) - # output_tensor = 
ttnn.to_layout(output_tensor, layout=ttnn.ROW_MAJOR_LAYOUT) - - # output_tensor = ttnn.reshape( - # output_tensor, (input_tensor.shape[0], _out_height, _out_width, output_tensor.shape[3]) - # ) - # del _out_height, _out_width return output_tensor, _out_height, _out_width diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_attention.py b/models/demos/segformer/tt/ttnn_segformer_attention.py similarity index 76% rename from models/experimental/functional_segformer/tt/ttnn_segformer_attention.py rename to models/demos/segformer/tt/ttnn_segformer_attention.py index e37a1a5b706..7ec2aacfc9b 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_attention.py +++ b/models/demos/segformer/tt/ttnn_segformer_attention.py @@ -2,10 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 -from models.experimental.functional_segformer.tt.ttnn_segformer_efficient_selfattention import ( +from models.demos.segformer.tt.ttnn_segformer_efficient_selfattention import ( TtSegformerEfficientSelfAttention, ) -from models.experimental.functional_segformer.tt.ttnn_segformer_selfoutput import TtSegformerSelfOutput +from models.demos.segformer.tt.ttnn_segformer_selfoutput import TtSegformerSelfOutput import ttnn @@ -22,9 +22,8 @@ def __init__(self, hidden_size, num_attention_heads, parameters, sequence_reduct def __call__(self, hidden_states: ttnn.Tensor, height: int, width: int, parameters, output_attentions=False): self_outputs = self.self(hidden_states, height, width, parameters.self, output_attentions) - attention_output = self.output(self_outputs[0], parameters.output) - outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + outputs = (attention_output,) + self_outputs[1:] ttnn.deallocate(self_outputs[0]) return outputs diff --git a/models/demos/segformer/tt/ttnn_segformer_decode_head.py b/models/demos/segformer/tt/ttnn_segformer_decode_head.py new file mode 100644 index 00000000000..6aed216c578 --- /dev/null +++ b/models/demos/segformer/tt/ttnn_segformer_decode_head.py @@ -0,0 +1,110 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +import math +import ttnn +from models.demos.segformer.tt.ttnn_segformer_mlp import TtSegformerMLP +from torch import nn +import tt_lib +from models.demos.segformer.tt.common import Conv +from tests.ttnn.ttnn_utility_fuction import get_shard_grid_from_num_cores + + +def torch_to_ttnn(input, device, layout=ttnn.TILE_LAYOUT): + input = ttnn.from_torch(input, ttnn.bfloat8_b) + input = ttnn.to_layout(input, layout) + input = ttnn.to_device(input, device) + return input + + +def ttnn_to_torch(input): + input = ttnn.to_layout(input, ttnn.ROW_MAJOR_LAYOUT) + input = ttnn.from_device(input) + input = ttnn.to_torch(input) + return input + + +class TtSegformerDecodeHead: + def __init__(self, config, parameters): + super().__init__() + # linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size + mlps = [] + for i in range(config.num_encoder_blocks): + mlp = TtSegformerMLP() + mlps.append(mlp) + self.linear_c = mlps + + self.linear_fuse = Conv([1, 1, 0, 0], parameters=parameters["linear_fuse"], activation="relu", deallocate=False) + + self.classifier = Conv( + [1, 1, 0, 0], + parameters=parameters["classifier"], + ) + + self.config = config + + def __call__(self, encoder_hidden_states: ttnn.bfloat8_b, parameters) -> ttnn.Tensor: + device = encoder_hidden_states[-1].device() + batch_size = encoder_hidden_states[-1].shape[0] + + all_hidden_states = () + concated_tensor = 0 + index = 0 + for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.linear_c): + height = width = int(math.sqrt(encoder_hidden_state.shape[-2])) + encoder_hidden_state = mlp(encoder_hidden_state, parameters=parameters["linear_c"][index]) + encoder_hidden_state = ttnn.to_layout(encoder_hidden_state, layout=ttnn.ROW_MAJOR_LAYOUT) + encoder_hidden_state = ttnn.reshape(encoder_hidden_state, (batch_size, height, width, -1)) + + if encoder_hidden_state.shape[-2] == 16: + ncores = 16 + elif encoder_hidden_state.shape[-2] == 32: + ncores = 32 + else: + ncores = 64 + + shard_grid = get_shard_grid_from_num_cores(ncores, device) + shard_orientation = ttnn.ShardOrientation.ROW_MAJOR + shard_height = math.ceil( + encoder_hidden_state.shape[0] * encoder_hidden_state.shape[1] * encoder_hidden_state.shape[2] / ncores + ) + shard_width = encoder_hidden_state.shape[3] + shard_spec = ttnn.ShardSpec(shard_grid, (shard_height, shard_width), shard_orientation, False) + input_memory_config = ttnn.MemoryConfig( + ttnn.types.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.types.BufferType.L1, shard_spec + ) + encoder_hidden_state = ttnn.to_memory_config(encoder_hidden_state, memory_config=input_memory_config) + + encoder_hidden_state = ttnn.upsample( + encoder_hidden_state, + scale_factor=(128 // encoder_hidden_state.shape[2], 128 // encoder_hidden_state.shape[2], 1), + mode="bilinear", + ) + + encoder_hidden_state_to_concat = ttnn.to_memory_config( + encoder_hidden_state, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16 + ) + + ttnn.deallocate(encoder_hidden_state) + encoder_hidden_state_to_concat = ttnn.reallocate(encoder_hidden_state_to_concat) + + all_hidden_states += (encoder_hidden_state_to_concat,) + + index += 1 + + concated_tensor = ttnn.concat(all_hidden_states[::-1], dim=3, memory_config=ttnn.L1_MEMORY_CONFIG) + ttnn.deallocate(all_hidden_states[0]) + ttnn.deallocate(all_hidden_states[1]) + ttnn.deallocate(all_hidden_states[2]) + ttnn.deallocate(all_hidden_states[3]) + concated_tensor = ttnn.reallocate(concated_tensor) + + concated_tensor_tile = 
ttnn.to_layout(concated_tensor, ttnn.TILE_LAYOUT, dtype=ttnn.bfloat8_b) + ttnn.deallocate(concated_tensor) + concated_tensor_tile = ttnn.reallocate(concated_tensor_tile) + + hidden_states, __, __ = self.linear_fuse(device, concated_tensor_tile) + logits, __, __ = self.classifier(device, hidden_states) + + return logits diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_dwconv.py b/models/demos/segformer/tt/ttnn_segformer_dwconv.py similarity index 71% rename from models/experimental/functional_segformer/tt/ttnn_segformer_dwconv.py rename to models/demos/segformer/tt/ttnn_segformer_dwconv.py index ad0e43dd42e..d63563d760a 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_dwconv.py +++ b/models/demos/segformer/tt/ttnn_segformer_dwconv.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import ttnn -from models.experimental.functional_segformer.tt.common import Conv +from models.demos.segformer.tt.common import Conv class TtSegformerDWConv: @@ -37,8 +37,6 @@ def __call__( width: int, device, ): - # print("dw0", hidden_states.shape) - if len(hidden_states.shape) == 3: batch_size, seq_len, num_channels = hidden_states.shape elif len(hidden_states.shape) == 4: @@ -47,15 +45,6 @@ def __call__( hidden_states = ttnn.to_layout(hidden_states, ttnn.ROW_MAJOR_LAYOUT) hidden_states = ttnn.reshape(hidden_states, (batch_size, height, width, num_channels)) - # print("dw1", hidden_states.shape) hidden_states = self.dwconv(device, hidden_states) - # print("dw2", hidden_states.shape) - # hidden_states = ttnn.reshape( - # hidden_states, - # (hidden_states.shape[0], hidden_states.shape[1] * hidden_states.shape[2], hidden_states.shape[3]), - # ) - # hidden_states = ttnn.to_layout(hidden_states, layout=ttnn.TILE_LAYOUT) - # hidden_states = ttnn.to_device(hidden_states, device=device) - # print("dw3", hidden_states.shape) return hidden_states diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_efficient_selfattention.py b/models/demos/segformer/tt/ttnn_segformer_efficient_selfattention.py similarity index 61% rename from models/experimental/functional_segformer/tt/ttnn_segformer_efficient_selfattention.py rename to models/demos/segformer/tt/ttnn_segformer_efficient_selfattention.py index b4dbade4a3d..b2d9e2f8fb2 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_efficient_selfattention.py +++ b/models/demos/segformer/tt/ttnn_segformer_efficient_selfattention.py @@ -4,7 +4,7 @@ import ttnn import math -from models.experimental.functional_segformer.tt.common import Conv +from models.demos.segformer.tt.common import Conv class TtSegformerEfficientSelfAttention: @@ -25,31 +25,6 @@ def __init__(self, hidden_size, num_attention_heads, parameters, sequence_reduct if sequence_reduction_ratio > 1: self.sr = Conv([sequence_reduction_ratio, sequence_reduction_ratio, 0, 0], parameters["sr"]) - def transpose_for_scores(self, hidden_states): - # new_shape = tuple(hidden_states.shape)[:-1] + (self.num_attention_heads, self.attention_head_size) - new_shape = ( - hidden_states.shape[0], - hidden_states.shape[-2], - self.num_attention_heads, - self.attention_head_size, - ) - device = hidden_states.device() - hidden_states = ttnn.from_device(hidden_states) - hidden_states = ttnn.to_layout(hidden_states, layout=ttnn.ROW_MAJOR_LAYOUT) - hidden_states = ttnn.reshape(hidden_states, new_shape) - hidden_states = ttnn.to_layout(hidden_states, layout=ttnn.TILE_LAYOUT) - hidden_states = ttnn.to_device(hidden_states, device) - - if 
len(hidden_states.shape) == 5: - output = ttnn.permute(hidden_states, (0, 1, 3, 2, 4)) - elif len(hidden_states.shape) == 4: - output = ttnn.permute(hidden_states, (0, 2, 1, 3)) - if len(hidden_states.shape) == 3: - output = ttnn.permute(hidden_states, (0, 2, 1)) - ttnn.deallocate(hidden_states) - - return output - def __call__( self, hidden_states: ttnn.Tensor, @@ -60,12 +35,17 @@ def __call__( ): device = hidden_states.device() + if len(hidden_states.shape) == 4: + batch_size, __, seq_len, hidden_size = hidden_states.shape + elif len(hidden_states.shape) == 3: + batch_size, seq_len, hidden_size = hidden_states.shape + mm_a_x_strategy = ttnn.ShardStrategy.HEIGHT mm_a_x_memory_config = ttnn.L1_HEIGHT_SHARDED_MEMORY_CONFIG mm_d_x_strategy = mm_a_x_strategy mm_d_x_memory_config = mm_a_x_memory_config mm_a_y = 8 - if (hidden_states.shape[-2] == 256) and (hidden_states.shape[-1] == 256): + if (seq_len == 256) and (hidden_size == 256): mm_a_x = 8 mm_b_x = 8 mm_d_x = 2 @@ -73,7 +53,7 @@ def __call__( mm_e_x = 8 mm_a_x_strategy = ttnn.ShardStrategy.BLOCK mm_a_x_memory_config = ttnn.L1_BLOCK_SHARDED_MEMORY_CONFIG - elif (hidden_states.shape[-2] == 1024) and (hidden_states.shape[-1] == 160): + elif (seq_len == 1024) and (hidden_size == 160): mm_a_x = 5 mm_b_x = 5 mm_d_x = 5 @@ -81,21 +61,19 @@ def __call__( mm_e_x = 8 mm_a_x_strategy = ttnn.ShardStrategy.BLOCK mm_a_x_memory_config = ttnn.L1_BLOCK_SHARDED_MEMORY_CONFIG - elif (hidden_states.shape[-2] == 4096) and (hidden_states.shape[-1] == 64): + elif (seq_len == 4096) and (hidden_size == 64): mm_a_x = 8 # 8 mm_b_x = 1 # 1 mm_d_x = 4 mm_d_y = 8 mm_e_x = 8 - elif (hidden_states.shape[-2] == 16384) and (hidden_states.shape[-1] == 32): + elif (seq_len == 16384) and (hidden_size == 32): mm_a_x = 8 # 8 mm_b_x = 1 # 1 - mm_d_x = 4 + mm_d_x = 8 mm_d_y = 8 mm_e_x = 8 - # print("mm-1--", hidden_states.shape, parameters.query.weight.shape) - hidden_states = ttnn.to_layout(hidden_states, ttnn.TILE_LAYOUT) hidden_states = ttnn.to_memory_config( hidden_states, @@ -116,33 +94,31 @@ def __call__( dtype=ttnn.bfloat8_b, ) - # print("Q1", query.shape) - hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) - query = ttnn.to_memory_config(query, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) + query = ttnn.to_memory_config(query, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) # Split Heads if self.num_attention_heads == 1: query_layer = query else: - query_layer = self.transpose_for_scores(query) - - # print("Q2", query_layer.shape) + query_layer = ttnn.experimental.nlp_create_qkv_heads_segformer(query, memory_config=ttnn.L1_MEMORY_CONFIG)[ + 0 + ] - # print("sr0", hidden_states.shape) if self.sr_ratio > 1: if len(hidden_states.shape) == 3: batch_size, seq_len, num_channels = hidden_states.shape elif len(hidden_states.shape) == 4: batch_size, __, seq_len, num_channels = hidden_states.shape + # Need for RM input to reshape, then back to TILE after that + hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) hidden_states = ttnn.to_layout(hidden_states, layout=ttnn.ROW_MAJOR_LAYOUT) hidden_states = ttnn.reshape(hidden_states, (batch_size, height, width, num_channels)) + hidden_states = ttnn.to_layout(hidden_states, ttnn.TILE_LAYOUT) + hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) - # print("sr1", hidden_states.shape) hidden_states, __, __ = self.sr(device, hidden_states) - # print("sr2", hidden_states.shape) hidden_states = 
ttnn.to_memory_config(hidden_states, memory_config=ttnn.L1_MEMORY_CONFIG) - # print("sr3", hidden_states.shape) hidden_states = ttnn.layer_norm( hidden_states, weight=parameters.layer_norm.weight, @@ -150,8 +126,6 @@ def __call__( memory_config=ttnn.L1_MEMORY_CONFIG, ) - # print("mm-2--", hidden_states.shape, parameters.key.weight.shape) - hidden_states = ttnn.to_layout(hidden_states, ttnn.TILE_LAYOUT) hidden_states = ttnn.to_memory_config( hidden_states, @@ -171,18 +145,14 @@ def __call__( core_grid=ttnn.CoreGrid(y=mm_a_y, x=mm_b_x), dtype=ttnn.bfloat8_b, ) - # hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) - # print("K1", key.shape) - key = ttnn.to_memory_config(key, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) + key = ttnn.to_memory_config(key, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) if self.num_attention_heads == 1: key_layer = key else: - key_layer = self.transpose_for_scores(key) + key_layer = ttnn.experimental.nlp_create_qkv_heads_segformer(key, memory_config=ttnn.L1_MEMORY_CONFIG)[0] key_layer = ttnn.permute(key_layer, (0, 1, 3, 2)) - # print("K2", key_layer.shape) - # print("mm-3--", hidden_states.shape, parameters.value.weight.shape) value = ttnn.linear( hidden_states, parameters.value.weight, @@ -191,17 +161,14 @@ def __call__( core_grid=ttnn.CoreGrid(y=mm_a_y, x=mm_b_x), dtype=ttnn.bfloat8_b, ) - # hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) ttnn.deallocate(hidden_states) - # print("V1", value.shape) - value = ttnn.to_memory_config(value, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) + value = ttnn.to_memory_config(value, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) if self.num_attention_heads == 1: value_layer = value else: - value_layer = self.transpose_for_scores(value) - # print("V2", value_layer.shape) - - # print("mm-4--", query_layer.shape, key_layer.shape) + value_layer = ttnn.experimental.nlp_create_qkv_heads_segformer(value, memory_config=ttnn.L1_MEMORY_CONFIG)[ + 0 + ] key_layer = ttnn.to_memory_config(key_layer, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) query_layer = ttnn.to_layout(query_layer, ttnn.TILE_LAYOUT) @@ -225,26 +192,14 @@ def __call__( ttnn.deallocate(query_layer) ttnn.deallocate(key_layer) - attention_scores = ttnn.to_memory_config(attention_scores, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) - denominator_value = ttnn.ones( - attention_scores.shape, layout=ttnn.TILE_LAYOUT, device=device, memory_config=ttnn.L1_MEMORY_CONFIG - ) - denominator_value = denominator_value * math.sqrt(self.attention_head_size) - denominator_value = ttnn.reciprocal(denominator_value) - attention_scores = attention_scores * denominator_value - - # Normalize the attention scores to probabilities. + attention_scores = ttnn.to_memory_config(attention_scores, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) + scale_value = self.attention_head_size**-0.5 + attention_scores = ttnn.multiply(attention_scores, scale_value) attention_probs = ttnn.softmax(attention_scores, dim=-1, memory_config=ttnn.L1_MEMORY_CONFIG) ttnn.deallocate(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
- # attention_probs = self.dropout(attention_probs) - - # print("mm-5--", attention_probs.shape, value_layer.shape) - attention_probs = ttnn.to_layout(attention_probs, ttnn.TILE_LAYOUT) + value_layer = ttnn.to_memory_config(value_layer, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) attention_probs = ttnn.to_memory_config( @@ -262,8 +217,6 @@ def __call__( value_layer, memory_config=mm_d_x_memory_config, dtype=ttnn.bfloat8_b, - # core_grid=ttnn.CoreGrid(y=8, x=8), - # program_config=ATTN_SCORE_MM_PROGCFG, ) ttnn.deallocate(value) ttnn.deallocate(value_layer) @@ -271,23 +224,13 @@ def __call__( if not output_attentions: ttnn.deallocate(attention_probs) else: - attention_probs = ttnn.to_memory_config(attention_probs, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) - # context_layer = ttnn.to_memory_config(context_layer, ttnn.DRAM_MEMORY_CONFIG, dtype=ttnn.bfloat16) - context_layer = ttnn.to_memory_config(context_layer, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) - # print("cxt1", context_layer.shape) + attention_probs = ttnn.to_memory_config(attention_probs, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) if self.num_attention_heads > 1: - context_layer = ttnn.permute(context_layer, (0, 2, 1, 3)) - context_layer = ttnn.to_memory_config( - context_layer, ttnn.L1_MEMORY_CONFIG - ) # This throws OOM issue while runnning whole_model so, DRAM memory config is used. - new_context_layer_shape = tuple(context_layer.shape)[:-2] + (self.all_head_size,) - context_layer = ttnn.from_device(context_layer) - context_layer = ttnn.to_layout(context_layer, layout=ttnn.ROW_MAJOR_LAYOUT) - context_layer = ttnn.reshape(context_layer, new_context_layer_shape) - context_layer = ttnn.to_device(context_layer, device) - context_layer = ttnn.to_layout(context_layer, layout=ttnn.TILE_LAYOUT) - # print("cxt2", context_layer.shape) + context_layer = ttnn.to_memory_config(context_layer, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) + context_layer = ttnn.experimental.nlp_concat_heads( + context_layer, memory_config=ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.L1) + ) outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_encoder.py b/models/demos/segformer/tt/ttnn_segformer_encoder.py similarity index 84% rename from models/experimental/functional_segformer/tt/ttnn_segformer_encoder.py rename to models/demos/segformer/tt/ttnn_segformer_encoder.py index 66cfe272727..2aecb8c263a 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_encoder.py +++ b/models/demos/segformer/tt/ttnn_segformer_encoder.py @@ -3,10 +3,10 @@ # SPDX-License-Identifier: Apache-2.0 import ttnn -from models.experimental.functional_segformer.tt.ttnn_segformer_overlap_patch_embeddings import ( +from models.demos.segformer.tt.ttnn_segformer_overlap_patch_embeddings import ( TtSegformerOverlapPatchEmbeddings, ) -from models.experimental.functional_segformer.tt.ttnn_segformer_layer import TtSegformerLayer +from models.demos.segformer.tt.ttnn_segformer_layer import TtSegformerLayer from typing import Optional, Tuple, Union from dataclasses import dataclass @@ -119,26 +119,18 @@ def __call__( math_fidelity=ttnn.MathFidelity.LoFi, ), ) - # pass the unflolded version to the Decoder if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - # print("e1", hidden_states.shape) - # fourth, optionally reshape back to (batch_size, num_channels, height, width) - # TODO: does the input to Conv need to be 4D? 
- if idx != len(self.patch_embeddings) - 1 or ( - idx == len(self.patch_embeddings) - 1 and self.config.reshape_last_stage - ): + if idx != len(self.patch_embeddings) - 1: + hidden_states = ttnn.to_layout(hidden_states, layout=ttnn.ROW_MAJOR_LAYOUT) + hidden_states = ttnn.reshape(hidden_states, (batch_size, height, width, -1)) + elif idx == len(self.patch_embeddings) - 1 and self.config.reshape_last_stage: + # last stage hidden_states = ttnn.to_layout(hidden_states, layout=ttnn.ROW_MAJOR_LAYOUT) hidden_states = ttnn.reshape(hidden_states, (batch_size, height, width, -1)) - hidden_states = ttnn.to_layout(hidden_states, layout=ttnn.TILE_LAYOUT) - hidden_states = ttnn.permute(hidden_states, (0, 3, 1, 2)) - # Original folded version is passed to the Decoder - # if output_hidden_states: - # all_hidden_states = all_hidden_states + (hidden_states,) - # print("e2", hidden_states.shape) if not return_dict: return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) return TtBaseModelOutput( diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_for_semantic_segmentation.py b/models/demos/segformer/tt/ttnn_segformer_for_semantic_segmentation.py similarity index 91% rename from models/experimental/functional_segformer/tt/ttnn_segformer_for_semantic_segmentation.py rename to models/demos/segformer/tt/ttnn_segformer_for_semantic_segmentation.py index e5b889fcab6..09225c51a1b 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_for_semantic_segmentation.py +++ b/models/demos/segformer/tt/ttnn_segformer_for_semantic_segmentation.py @@ -3,8 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 import ttnn -from models.experimental.functional_segformer.tt.ttnn_segformer_decode_head import TtSegformerDecodeHead -from models.experimental.functional_segformer.tt.ttnn_segformer_model import TtSegformerModel +from models.demos.segformer.tt.ttnn_segformer_decode_head import TtSegformerDecodeHead +from models.demos.segformer.tt.ttnn_segformer_model import TtSegformerModel from typing import Tuple, Union, Optional from dataclasses import dataclass diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_layer.py b/models/demos/segformer/tt/ttnn_segformer_layer.py similarity index 92% rename from models/experimental/functional_segformer/tt/ttnn_segformer_layer.py rename to models/demos/segformer/tt/ttnn_segformer_layer.py index 983c935011e..77234293988 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_layer.py +++ b/models/demos/segformer/tt/ttnn_segformer_layer.py @@ -3,8 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 import ttnn -from models.experimental.functional_segformer.tt.ttnn_segformer_attention import TtSegformerAttention -from models.experimental.functional_segformer.tt.ttnn_segformer_mix_ffn import TtSegformerMixFFN +from models.demos.segformer.tt.ttnn_segformer_attention import TtSegformerAttention +from models.demos.segformer.tt.ttnn_segformer_mix_ffn import TtSegformerMixFFN class TtSegformerLayer: @@ -37,7 +37,6 @@ def __call__( parameters.attention, output_attentions=output_attentions, ) - attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:] # add self attentions if we output attention weights diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_mix_ffn.py b/models/demos/segformer/tt/ttnn_segformer_mix_ffn.py similarity index 73% rename from models/experimental/functional_segformer/tt/ttnn_segformer_mix_ffn.py rename to 
models/demos/segformer/tt/ttnn_segformer_mix_ffn.py index 1a6ec3038e6..4cbc54c8cab 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_mix_ffn.py +++ b/models/demos/segformer/tt/ttnn_segformer_mix_ffn.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import ttnn -from models.experimental.functional_segformer.tt.ttnn_segformer_dwconv import TtSegformerDWConv +from models.demos.segformer.tt.ttnn_segformer_dwconv import TtSegformerDWConv class TtSegformerMixFFN: @@ -12,25 +12,28 @@ def __init__(self, parameters, hidden_features): self.dwconv = TtSegformerDWConv(parameters["dwconv"], hidden_features) def __call__(self, hidden_states: ttnn.Tensor, height: int, width: int, parameters, device): - # print("mm-7--", hidden_states.shape, parameters.dense1.weight.shape) + if len(hidden_states.shape) == 4: + batch_size, __, seq_len, hidden_size = hidden_states.shape + elif len(hidden_states.shape) == 3: + batch_size, seq_len, hidden_size = hidden_states.shape mm_f_x_strategy = ttnn.ShardStrategy.HEIGHT mm_f_x_memory_config = ttnn.L1_HEIGHT_SHARDED_MEMORY_CONFIG mm_f_y = 8 - if (hidden_states.shape[-2] == 256) and (hidden_states.shape[-1] == 256): - mm_f_x = 4 + if (seq_len == 256) and (hidden_size == 256): + mm_f_x = 8 mm_f_x_strategy = ttnn.ShardStrategy.BLOCK mm_f_x_memory_config = ttnn.L1_BLOCK_SHARDED_MEMORY_CONFIG - elif (hidden_states.shape[-2] == 1024) and (hidden_states.shape[-1] == 160): + elif (seq_len == 1024) and (hidden_size == 160): mm_f_x = 5 mm_f_x_strategy = ttnn.ShardStrategy.BLOCK mm_f_x_memory_config = ttnn.L1_BLOCK_SHARDED_MEMORY_CONFIG - elif (hidden_states.shape[-2] == 4096) and (hidden_states.shape[-1] == 64): + elif (seq_len == 4096) and (hidden_size == 64): mm_f_x = 2 mm_f_x_strategy = ttnn.ShardStrategy.BLOCK mm_f_x_memory_config = ttnn.L1_BLOCK_SHARDED_MEMORY_CONFIG - elif (hidden_states.shape[-2] == 16384) and (hidden_states.shape[-1] == 32): - mm_f_x = 4 + elif (seq_len == 16384) and (hidden_size == 32): + mm_f_x = 8 hidden_states = ttnn.to_layout(hidden_states, ttnn.TILE_LAYOUT) @@ -55,12 +58,8 @@ def __call__(self, hidden_states: ttnn.Tensor, height: int, width: int, paramete hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) hidden_states, __, __ = self.dwconv(hidden_states, height, width, device) - hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) - # TODO: GeLU on sharded data - hidden_states = ttnn.gelu(hidden_states, memory_config=ttnn.L1_MEMORY_CONFIG) - - # print("mm-8--", hidden_states.shape, parameters.dense2.weight.shape) - hidden_states = ttnn.to_layout(hidden_states, ttnn.TILE_LAYOUT) + # # TODO: GeLU on sharded data + hidden_states = ttnn.gelu(hidden_states) hidden_states = ttnn.to_memory_config( hidden_states, @@ -70,7 +69,6 @@ def __call__(self, hidden_states: ttnn.Tensor, height: int, width: int, paramete strategy=mm_f_x_strategy, orientation=ttnn.ShardOrientation.ROW_MAJOR, ), - dtype=ttnn.bfloat8_b, ) hidden_states = ttnn.linear( hidden_states, @@ -81,5 +79,5 @@ def __call__(self, hidden_states: ttnn.Tensor, height: int, width: int, paramete dtype=ttnn.bfloat8_b, ) - hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) + hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) return hidden_states diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_mlp.py b/models/demos/segformer/tt/ttnn_segformer_mlp.py similarity index 79% 
rename from models/experimental/functional_segformer/tt/ttnn_segformer_mlp.py rename to models/demos/segformer/tt/ttnn_segformer_mlp.py index da1efbde00e..34a8f7efeac 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_mlp.py +++ b/models/demos/segformer/tt/ttnn_segformer_mlp.py @@ -57,23 +57,6 @@ def __init__(self): def __call__(self, hidden_states: ttnn.Tensor, parameters): device = hidden_states.device() - if 0: - # print("mlp0", hidden_states.shape) - hidden_states = ttnn.from_device(hidden_states) - hidden_states = ttnn.to_layout(hidden_states, ttnn.ROW_MAJOR_LAYOUT) - hidden_states = ttnn.reshape( - hidden_states, - (hidden_states.shape[0], hidden_states.shape[1], hidden_states.shape[2] * hidden_states.shape[3]), - ) - hidden_states = ttnn.to_device(hidden_states, device=device) - hidden_states = ttnn.to_layout(hidden_states, ttnn.TILE_LAYOUT) - # print("mlp1", hidden_states.shape) - hidden_states = ttnn.permute(hidden_states, (0, 2, 1)) - # print("mlp2", hidden_states.shape) - if len(hidden_states.shape) == 2: # This is due to while permuting 1,x,y we are getting 2D as output - hidden_states = ttnn.reshape(hidden_states, (1, hidden_states.shape[0], hidden_states.shape[1])) - # print("mlp3", hidden_states.shape) - mm_f_x_strategy = ttnn.ShardStrategy.HEIGHT mm_f_x_memory_config = ttnn.L1_HEIGHT_SHARDED_MEMORY_CONFIG mm_f_y = 8 @@ -95,7 +78,6 @@ def __call__(self, hidden_states: ttnn.Tensor, parameters): mm_prog_config = program_configs["linear_config_16384"] hidden_states = ttnn.to_layout(hidden_states, ttnn.TILE_LAYOUT) - # print("---mlp MM----", hidden_states.shape, parameters.proj.weight.shape) if (hidden_states.shape[-2] == 1024) and (hidden_states.shape[-1] == 160): # TODO: convert it to sharding diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_model.py b/models/demos/segformer/tt/ttnn_segformer_model.py similarity index 95% rename from models/experimental/functional_segformer/tt/ttnn_segformer_model.py rename to models/demos/segformer/tt/ttnn_segformer_model.py index ff0c711d325..d325895ce8f 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_model.py +++ b/models/demos/segformer/tt/ttnn_segformer_model.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import ttnn -from models.experimental.functional_segformer.tt.ttnn_segformer_encoder import TtSegformerEncoder +from models.demos.segformer.tt.ttnn_segformer_encoder import TtSegformerEncoder from typing import Optional, Tuple, Union from dataclasses import dataclass diff --git a/models/demos/segformer/tt/ttnn_segformer_overlap_patch_embeddings.py b/models/demos/segformer/tt/ttnn_segformer_overlap_patch_embeddings.py new file mode 100644 index 00000000000..c81bfad5569 --- /dev/null +++ b/models/demos/segformer/tt/ttnn_segformer_overlap_patch_embeddings.py @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +import ttnn +from models.demos.segformer.tt.common import Conv + + +class TtSegformerOverlapPatchEmbeddings: + """Construct the overlapping patch embeddings.""" + + def __init__(self, parameters, stride, patch_size): + super().__init__() + self.proj = Conv([stride, stride, patch_size // 2, patch_size // 2], parameters=parameters["proj"]) + + def __call__( + self, + pixel_values: ttnn.Tensor, + parameters, + ): + device = pixel_values.device() + + if pixel_values.shape[-1] == 3: + pixel_values_rm = ttnn.from_device(pixel_values) + pixel_values_rm = ttnn.to_layout(pixel_values_rm, layout=ttnn.ROW_MAJOR_LAYOUT) + else: + pixel_values_rm = pixel_values + + embeddings, input_height, input_width = self.proj(device, pixel_values_rm) + embeddings = ttnn.to_memory_config(embeddings, memory_config=ttnn.L1_MEMORY_CONFIG) + ttnn.deallocate(pixel_values) + embeddings = ttnn.reallocate(embeddings) + + embeddings = ttnn.layer_norm( + embeddings, + weight=parameters.layer_norm.weight, + bias=parameters.layer_norm.bias, + memory_config=ttnn.L1_MEMORY_CONFIG, + compute_kernel_config=ttnn.WormholeComputeKernelConfig( + math_fidelity=ttnn.MathFidelity.LoFi, + ), + ) + + return embeddings, input_height, input_width diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_selfoutput.py b/models/demos/segformer/tt/ttnn_segformer_selfoutput.py similarity index 72% rename from models/experimental/functional_segformer/tt/ttnn_segformer_selfoutput.py rename to models/demos/segformer/tt/ttnn_segformer_selfoutput.py index 22083348634..daf1d89d525 100644 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_selfoutput.py +++ b/models/demos/segformer/tt/ttnn_segformer_selfoutput.py @@ -10,23 +10,26 @@ def __init__(self): super().__init__() def __call__(self, hidden_states: ttnn.Tensor, parameters): - # print("mm-6--", hidden_states.shape) + if len(hidden_states.shape) == 4: + batch_size, __, seq_len, hidden_size = hidden_states.shape + elif len(hidden_states.shape) == 3: + batch_size, seq_len, hidden_size = hidden_states.shape mm_f_x_strategy = ttnn.ShardStrategy.HEIGHT mm_f_x_memory_config = ttnn.L1_HEIGHT_SHARDED_MEMORY_CONFIG mm_f_y = 8 - if (hidden_states.shape[-2] == 256) and (hidden_states.shape[-1] == 256): - mm_f_x = 4 + if (seq_len == 256) and (hidden_size == 256): + mm_f_x = 8 mm_f_x_strategy = ttnn.ShardStrategy.BLOCK mm_f_x_memory_config = ttnn.L1_BLOCK_SHARDED_MEMORY_CONFIG - elif (hidden_states.shape[-2] == 1024) and (hidden_states.shape[-1] == 160): + elif (seq_len == 1024) and (hidden_size == 160): mm_f_x = 5 mm_f_x_strategy = ttnn.ShardStrategy.BLOCK mm_f_x_memory_config = ttnn.L1_BLOCK_SHARDED_MEMORY_CONFIG - elif (hidden_states.shape[-2] == 4096) and (hidden_states.shape[-1] == 64): - mm_f_x = 4 - elif (hidden_states.shape[-2] == 16384) and (hidden_states.shape[-1] == 32): - mm_f_x = 4 + elif (seq_len == 4096) and (hidden_size == 64): + mm_f_x = 8 + elif (seq_len == 16384) and (hidden_size == 32): + mm_f_x = 8 hidden_states = ttnn.to_memory_config( hidden_states, @@ -47,7 +50,6 @@ def __call__(self, hidden_states: ttnn.Tensor, parameters): dtype=ttnn.bfloat8_b, ) - hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) - # print("self", hidden_states.shape) + hidden_states = ttnn.to_memory_config(hidden_states, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) return hidden_states diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_decode_head.py 
b/models/experimental/functional_segformer/tt/ttnn_segformer_decode_head.py deleted file mode 100644 index 1a78b2acc02..00000000000 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_decode_head.py +++ /dev/null @@ -1,175 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import math -import ttnn -from models.experimental.functional_segformer.tt.ttnn_segformer_mlp import TtSegformerMLP -from torch import nn -import tt_lib -from models.experimental.functional_segformer.tt.common import Conv -from tests.ttnn.ttnn_utility_fuction import get_shard_grid_from_num_cores - - -def torch_to_ttnn(input, device, layout=ttnn.TILE_LAYOUT): - input = ttnn.from_torch(input, ttnn.bfloat16) - input = ttnn.to_layout(input, layout) - input = ttnn.to_device(input, device) - return input - - -def ttnn_to_torch(input): - input = ttnn.to_layout(input, ttnn.ROW_MAJOR_LAYOUT) - input = ttnn.from_device(input) - input = ttnn.to_torch(input) - return input - - -class TtSegformerDecodeHead: - def __init__(self, config, parameters): - super().__init__() - # linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size - mlps = [] - for i in range(config.num_encoder_blocks): - mlp = TtSegformerMLP() - mlps.append(mlp) - self.linear_c = mlps - - self.linear_fuse = Conv( - [1, 1, 0, 0], - parameters=parameters["linear_fuse"], - activation="relu", - ) - - self.classifier = Conv( - [1, 1, 0, 0], - parameters=parameters["classifier"], - ) - - self.config = config - - def __call__(self, encoder_hidden_states: ttnn.bfloat16, parameters) -> ttnn.Tensor: - device = encoder_hidden_states[-1].device() - batch_size = encoder_hidden_states[-1].shape[0] - - all_hidden_states = () - index = 0 - for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.linear_c): - # This condition was the original PyTorch graph with the redundant TMs to unfold/fold/unfold - if 0: - if self.config.reshape_last_stage is False and (encoder_hidden_state.shape) == 3: - height = width = int(math.sqrt(encoder_hidden_state.shape[-1])) - encoder_hidden_state = ttnn.reshape(encoder_hidden_state, (batch_size, height, width, -1)) - encoder_hidden_state = ttnn.permute(encoder_hidden_state, (0, 3, 1, 2)) - height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3] - else: - height = width = int(math.sqrt(encoder_hidden_state.shape[-2])) - - # print("iii", index) - # print("ee0", encoder_hidden_state.shape) - encoder_hidden_state = mlp(encoder_hidden_state, parameters=parameters["linear_c"][index]) - - # print("ee1", encoder_hidden_state.shape) - encoder_hidden_state = ttnn.to_layout(encoder_hidden_state, layout=ttnn.ROW_MAJOR_LAYOUT) - encoder_hidden_state = ttnn.reshape(encoder_hidden_state, (batch_size, height, width, -1)) - - ## Original TM but I replaced it with one reshape in the previous line. 
- ## To revisit to investigate on the low PCC - """ - if len(encoder_hidden_state.shape) == 3: - encoder_hidden_state = ttnn.permute(encoder_hidden_state, (0, 2, 1)) - else: - encoder_hidden_state = ttnn.permute(encoder_hidden_state, (0, 1, 3, 2)) - print("ee2", encoder_hidden_state.shape) - encoder_hidden_state = ttnn.from_device(encoder_hidden_state) - encoder_hidden_state = ttnn.to_layout(encoder_hidden_state, layout=ttnn.ROW_MAJOR_LAYOUT) - encoder_hidden_state = ttnn.reshape(encoder_hidden_state, (batch_size, -1, height, width)) - encoder_hidden_state = ttnn.to_device(encoder_hidden_state, device) - print("ee3", encoder_hidden_state.shape) - encoder_hidden_state = ttnn.permute(encoder_hidden_state, (0, 2, 3, 1)) - print("ee4", encoder_hidden_state.shape) - """ - - if encoder_hidden_state.shape[-2] == 16: - ncores = 16 - elif encoder_hidden_state.shape[-2] == 32: - ncores = 32 - else: - ncores = 64 - - shard_grid = get_shard_grid_from_num_cores(ncores, device) - shard_orientation = ttnn.ShardOrientation.ROW_MAJOR - - shard_height = math.ceil( - encoder_hidden_state.shape[0] * encoder_hidden_state.shape[1] * encoder_hidden_state.shape[2] / ncores - ) - shard_width = encoder_hidden_state.shape[3] - shard_spec = ttnn.ShardSpec(shard_grid, (shard_height, shard_width), shard_orientation, False) - input_memory_config = ttnn.MemoryConfig( - ttnn.types.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.types.BufferType.L1, shard_spec - ) - encoder_hidden_state = ttnn.to_memory_config(encoder_hidden_state, memory_config=input_memory_config) - - # print(shard_spec) - # print("up", encoder_hidden_state.shape) - encoder_hidden_state = ttnn.upsample( - encoder_hidden_state, - scale_factor=(128 // encoder_hidden_state.shape[2], 128 // encoder_hidden_state.shape[2], 1), - mode="bilinear", - ) - - encoder_hidden_state = ttnn.to_memory_config( - encoder_hidden_state, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16 - ) - # print("up2", encoder_hidden_state.shape) - # encoder_hidden_state = ttnn.permute(encoder_hidden_state, (0, 3, 1, 2)) - # print("up3", encoder_hidden_state.shape) - - # all_hidden_states += (encoder_hidden_state,) - # encoder_hidden_state = ttnn.to_layout(encoder_hidden_state, ttnn.TILE_LAYOUT) - if index == 0: - concated_tensor = encoder_hidden_state - else: - concated_tensor = ttnn.concat( - [concated_tensor, encoder_hidden_state], dim=3, memory_config=ttnn.L1_MEMORY_CONFIG - ) - ttnn.deallocate(encoder_hidden_state) - ttnn.reallocate(concated_tensor) - # print("conc", index, concated_tensor.shape) - index += 1 - - # Replaced with the phased concat of each 2 inputs, to handle the L1 OOM - ###concated_tensor = ttnn.concat(all_hidden_states[::-1], dim=1, memory_config=ttnn.L1_MEMORY_CONFIG) - - # ttnn.deallocate(encoder_hidden_state) - # concated_tensor = ttnn.to_memory_config(concated_tensor, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat8_b) - # ttnn.reallocate(concated_tensor) - - # print("conc", concated_tensor.shape) - - # concated_tensor = ttnn.to_layout(concated_tensor, layout=ttnn.ROW_MAJOR_LAYOUT) - # concated_tensor = ttnn.permute(concated_tensor, (0, 3, 1, 2)) - # concated_tensor = ttnn.reshape(concated_tensor, (concated_tensor[0], concated_tensor[-1], concated_tensor[1], concated_tensor[2]) ) - - ttnn.reallocate(concated_tensor) - # print("conc_p", concated_tensor.shape) - - hidden_states, __, __ = self.linear_fuse(device, concated_tensor) - # print("c1", hidden_states.shape) - - logits, __, __ = self.classifier(device, hidden_states) - logits_shape = logits.shape - logits = 
ttnn.to_device(logits, device=device) - logits = ttnn.to_memory_config(logits, ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) - logits = ttnn.to_layout(logits, layout=ttnn.ROW_MAJOR_LAYOUT) - # print("logits", logits.shape) - h = w = int(math.sqrt(logits.shape[-2])) - logits = ttnn.reshape(logits, (logits_shape[0], h, w, logits_shape[-1])) - # print(logits.shape) - logits = ttnn.permute(logits, (0, 3, 1, 2)) - logits = ttnn.to_layout(logits, layout=ttnn.ROW_MAJOR_LAYOUT) - logits = logits[:, :150, :, :] # returns out_channel 160 instead of 150 - logits = ttnn.to_layout(logits, layout=ttnn.TILE_LAYOUT) - # logits are of shape (batch_size, num_labels, height/4, width/4) - - return logits diff --git a/models/experimental/functional_segformer/tt/ttnn_segformer_overlap_patch_embeddings.py b/models/experimental/functional_segformer/tt/ttnn_segformer_overlap_patch_embeddings.py deleted file mode 100644 index d20a3c1e11c..00000000000 --- a/models/experimental/functional_segformer/tt/ttnn_segformer_overlap_patch_embeddings.py +++ /dev/null @@ -1,73 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import ttnn -from models.experimental.functional_segformer.tt.common import Conv - - -class TtSegformerOverlapPatchEmbeddings: - """Construct the overlapping patch embeddings.""" - - def __init__(self, parameters, stride, patch_size): - super().__init__() - self.proj = Conv([stride, stride, patch_size // 2, patch_size // 2], parameters=parameters["proj"]) - - def __call__( - self, - pixel_values: ttnn.Tensor, - parameters, - ): - # print("ov0", pixel_values.shape) - device = pixel_values.device() - - pixel_values = ttnn.permute(pixel_values, (0, 2, 3, 1)) - - if pixel_values.shape[3] == 3: - pixel_values = ttnn.from_device(pixel_values) - pixel_values = ttnn.to_layout(pixel_values, layout=ttnn.ROW_MAJOR_LAYOUT) - embeddings, input_height, input_width = self.proj(device, pixel_values) - embeddings = ttnn.to_memory_config(embeddings, memory_config=ttnn.L1_MEMORY_CONFIG) - ttnn.deallocate(pixel_values) - # print("ov1", embeddings.shape) - """ - #embeddings = ttnn.to_layout(embeddings, layout=ttnn.TILE_LAYOUT) - embeddings = ttnn.to_layout(embeddings, layout=ttnn.ROW_MAJOR_LAYOUT) - embeddings = ttnn.to_device(embeddings, device=device) - - embeddings = ttnn.permute(embeddings, (0, 3, 1, 2)) - batch_size, _, input_height, input_width = embeddings.shape - - ttnn.deallocate(pixel_values) - embeddings = ttnn.from_device(embeddings) - embeddings = ttnn.to_layout(embeddings, layout=ttnn.ROW_MAJOR_LAYOUT) - - print("ov2", embeddings.shape) - embeddings = ttnn.reshape( - embeddings, (embeddings.shape[0], embeddings.shape[1], embeddings.shape[2] * embeddings.shape[3]) - ) - embeddings = ttnn.to_layout(embeddings, layout=ttnn.TILE_LAYOUT) - embeddings = ttnn.to_device(embeddings, device) - - print("ov3", embeddings.shape) - - embeddings = ttnn.permute(embeddings, (0, 2, 1)) - if len(embeddings.shape) == 2: - embeddings = ttnn.reshape(embeddings, (1, embeddings.shape[0], embeddings.shape[1])) - - print("ov4", embeddings.shape) - """ - - embeddings = ttnn.layer_norm( - embeddings, - weight=parameters.layer_norm.weight, - bias=parameters.layer_norm.bias, - memory_config=ttnn.L1_MEMORY_CONFIG, - compute_kernel_config=ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - ), - ) - - # print("ov5", embeddings.shape) - - return embeddings, input_height, input_width diff --git 
a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_segformer.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_segformer.py new file mode 100644 index 00000000000..0b5dd234fec --- /dev/null +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_segformer.py @@ -0,0 +1,110 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from loguru import logger + +from models.utility_functions import tt2torch_tensor, comp_pcc +from models.utility_functions import is_grayskull +import torch +import ttnn + +""" +Segformer shapes + functionality +""" + + +def run_nlp_create_qkv_heads_segformer_test(batch, seq_len, hidden_dim, dtype, in0_mem_config, out_mem_config, device): + torch.manual_seed(1234) + + in0_shape = [batch, 1, seq_len, hidden_dim] + + A = torch.randn(in0_shape) + + in0_t = ttnn.Tensor(A, dtype).to(ttnn.TILE_LAYOUT).to(device, in0_mem_config) + + q = ttnn.experimental.nlp_create_qkv_heads_segformer(in0_t, memory_config=out_mem_config)[0] + + # Check memory of inputs and outputs + assert in0_t.memory_config().buffer_type == in0_mem_config.buffer_type + assert q.memory_config().buffer_type == out_mem_config.buffer_type + logger.debug(f"in0: {in0_t.memory_config().buffer_type} and {in0_t.get_dtype()}") + logger.debug(f"q: {q.memory_config().buffer_type} and {q.get_dtype()}") + + head_dim = 32 + heads_num = hidden_dim // head_dim + assert list(q.shape.with_tile_padding()) == [batch, heads_num, seq_len, head_dim] + + pyt_got_back_rm_q = tt2torch_tensor(q) + + ref_q = A + # Additional shuffling for Q,K,V heads + ref_q = torch.reshape(ref_q, [batch, seq_len, heads_num, head_dim]).transpose(-3, -2) + + if dtype == ttnn.bfloat8_b: + pcc = 0.99 + else: + pcc = 1.0 + + passing_pcc_q, output_pcc_q = comp_pcc(pyt_got_back_rm_q, ref_q, pcc) + logger.debug(f"Q passing={passing_pcc_q}") + logger.debug(f"Q output pcc={output_pcc_q}") + assert passing_pcc_q + + +@pytest.mark.parametrize( + "out_mem_config", + ( + ttnn.DRAM_MEMORY_CONFIG, + ttnn.L1_MEMORY_CONFIG, + ), + ids=["out_DRAM", "out_L1"], +) +@pytest.mark.parametrize( + "in0_mem_config", + ( + ttnn.DRAM_MEMORY_CONFIG, + ttnn.L1_MEMORY_CONFIG, + ), + ids=["in0_DRAM", "in0_L1"], +) +@pytest.mark.parametrize( + "dtype", + (ttnn.bfloat8_b, ttnn.bfloat16), + ids=["BFLOAT8_B", "BFLOAT16"], +) +@pytest.mark.parametrize( + "batch, seq_len, hidden_dim", + ((1, 4096, 64), (1, 1024, 160), (1, 256, 256)), + ids=[ + "batch1_seq4k", + "batch1_seq1k", + "batch1_seq256", + ], +) +def test_nlp_create_qkv_heads_segformer_test( + batch, seq_len, hidden_dim, dtype, in0_mem_config, out_mem_config, request, device +): + if is_grayskull() and dtype == ttnn.float32: + pytest.skip("Skipping float32 tests on Grayskull") + run_nlp_create_qkv_heads_segformer_test(batch, seq_len, hidden_dim, dtype, in0_mem_config, out_mem_config, device) + + +def test_nlp_create_qkv_heads_segformer_with_program_cache(device, use_program_cache): + dtype = ttnn.bfloat8_b + mem_config = ttnn.DRAM_MEMORY_CONFIG + for _ in range(2): + run_nlp_create_qkv_heads_segformer_test(1, 32, 32, dtype, mem_config, mem_config, device) + dummy_shape = [1, 1, 32, 32] + py_dummy_tensor = torch.randn(dummy_shape) + tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config) + + mem_config = ttnn.L1_MEMORY_CONFIG + for _ in range(2): + run_nlp_create_qkv_heads_segformer_test(1, 32, 32, dtype, mem_config, mem_config, device) + 
dummy_shape = [1, 1, 32, 32] + py_dummy_tensor = torch.randn(dummy_shape) + tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config) + + assert device.num_program_cache_entries() == 2 diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_vit.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_vit.py new file mode 100644 index 00000000000..28d369a3ef9 --- /dev/null +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_nlp_create_qkv_heads_vit.py @@ -0,0 +1,123 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from loguru import logger + +from models.utility_functions import tt2torch_tensor, comp_pcc +from models.utility_functions import is_grayskull +import torch +import ttnn + +""" +ViT shapes + functionality +""" + + +def run_nlp_create_qkv_heads_vit_test(batch, seq_len, dtype, in0_mem_config, out_mem_config, device): + torch.manual_seed(1234) + + in0_shape = [batch, 1, seq_len, 2304] + + A = torch.randn(in0_shape) + + in0_t = ttnn.Tensor(A, dtype).to(ttnn.TILE_LAYOUT).to(device, in0_mem_config) + + q, k, v = ttnn.experimental.nlp_create_qkv_heads_vit(in0_t, memory_config=out_mem_config) + + # Check memory of inputs and outputs + assert in0_t.memory_config().buffer_type == in0_mem_config.buffer_type + assert q.memory_config().buffer_type == out_mem_config.buffer_type + assert k.memory_config().buffer_type == out_mem_config.buffer_type + assert v.memory_config().buffer_type == out_mem_config.buffer_type + logger.debug(f"in0: {in0_t.memory_config().buffer_type} and {in0_t.get_dtype()}") + logger.debug(f"q: {q.memory_config().buffer_type} and {q.get_dtype()}") + logger.debug(f"k: {k.memory_config().buffer_type} and {k.get_dtype()}") + logger.debug(f"v: {v.memory_config().buffer_type} and {v.get_dtype()}") + + assert list(q.shape.with_tile_padding()) == [batch, 12, seq_len, 64] + assert list(k.shape.with_tile_padding()) == [batch, 12, seq_len, 64] + assert list(v.shape.with_tile_padding()) == [batch, 12, seq_len, 64] + + pyt_got_back_rm_q = tt2torch_tensor(q) + pyt_got_back_rm_k = tt2torch_tensor(k) + pyt_got_back_rm_v = tt2torch_tensor(v) + + (ref_q, ref_k, ref_v) = torch.split(A, [768, 768, 768], dim=-1) + # Additional shuffling for Q,K,V heads + ref_q = torch.reshape(ref_q, [batch, seq_len, 12, 64]).transpose(-3, -2) + ref_k = torch.reshape(ref_k, [batch, seq_len, 12, 64]).transpose(-3, -2) + ref_v = torch.reshape(ref_v, [batch, seq_len, 12, 64]).transpose(-3, -2) + + if dtype == ttnn.bfloat8_b: + pcc = 0.99 + else: + pcc = 1.0 + + passing_pcc_q, output_pcc_q = comp_pcc(pyt_got_back_rm_q, ref_q, pcc) + logger.debug(f"Q passing={passing_pcc_q}") + logger.debug(f"Q output pcc={output_pcc_q}") + assert passing_pcc_q + passing_pcc_k, output_pcc_k = comp_pcc(pyt_got_back_rm_k, ref_k, pcc) + logger.debug(f"K passing={passing_pcc_k}") + logger.debug(f"K output pcc={output_pcc_k}") + assert passing_pcc_k + passing_pcc_v, output_pcc_v = comp_pcc(pyt_got_back_rm_v, ref_v, pcc) + logger.debug(f"V passing={passing_pcc_v}") + logger.debug(f"V output pcc={output_pcc_v}") + assert passing_pcc_v + + +@pytest.mark.parametrize( + "out_mem_config", + ( + ttnn.DRAM_MEMORY_CONFIG, + ttnn.L1_MEMORY_CONFIG, + ), + ids=["out_DRAM", "out_L1"], +) +@pytest.mark.parametrize( + "in0_mem_config", + ( + ttnn.DRAM_MEMORY_CONFIG, + ttnn.L1_MEMORY_CONFIG, + ), + ids=["in0_DRAM", "in0_L1"], +) +@pytest.mark.parametrize( + "dtype", + 
(ttnn.bfloat8_b, ttnn.bfloat16), + ids=["BFLOAT8_B", "BFLOAT16"], +) +@pytest.mark.parametrize( + "batch, seq_len", + ((1, 224), (1, 4096)), + ids=[ + "batch1_seq224", + "batch1_seq4k", + ], +) +def test_nlp_create_qkv_heads_vit_test(batch, seq_len, dtype, in0_mem_config, out_mem_config, request, device): + if is_grayskull() and dtype == ttnn.float32: + pytest.skip("Skipping float32 tests on Grayskull") + run_nlp_create_qkv_heads_vit_test(batch, seq_len, dtype, in0_mem_config, out_mem_config, device) + + +def test_nlp_create_qkv_heads_vit_with_program_cache(device, use_program_cache): + dtype = ttnn.bfloat8_b + mem_config = ttnn.DRAM_MEMORY_CONFIG + for _ in range(2): + run_nlp_create_qkv_heads_vit_test(1, 32, dtype, mem_config, mem_config, device) + dummy_shape = [1, 1, 32, 32] + py_dummy_tensor = torch.randn(dummy_shape) + tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config) + + mem_config = ttnn.L1_MEMORY_CONFIG + for _ in range(2): + run_nlp_create_qkv_heads_vit_test(1, 32, dtype, mem_config, mem_config, device) + dummy_shape = [1, 1, 32, 32] + py_dummy_tensor = torch.randn(dummy_shape) + tt_dummy_tensor = ttnn.Tensor(py_dummy_tensor, dtype).to(ttnn.TILE_LAYOUT).to(device, mem_config) + + assert device.num_program_cache_entries() == 2 diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_attention.py b/tests/ttnn/integration_tests/segformer/test_segformer_attention.py index a2e35af0a46..4cc714506cb 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_attention.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_attention.py @@ -11,10 +11,10 @@ import pytest from tests.ttnn.utils_for_testing import assert_with_pcc from models.utility_functions import skip_for_grayskull -from models.experimental.functional_segformer.tt.ttnn_segformer_attention import ( +from models.demos.segformer.tt.ttnn_segformer_attention import ( TtSegformerAttention, ) -from models.experimental.functional_segformer.reference.segformer_attention import SegformerAttention +from models.demos.segformer.reference.segformer_attention import SegformerAttention from tests.ttnn.integration_tests.segformer.test_segformer_efficient_selfattention import ( create_custom_preprocessor as create_customer_preprocessor_selfattention, ) @@ -72,7 +72,7 @@ def test_segformer_attention( if is_ci_env: pytest.skip("Skip in CI, model is WIP, issue# 13357") - torch_input_tensor = torch.randn(batch_size, seq_len, hidden_size) + torch_input_tensor = torch.randn(batch_size, 1, seq_len, hidden_size) ttnn_input_tensor = ttnn.from_torch( torch_input_tensor, dtype=ttnn.bfloat16, @@ -95,6 +95,7 @@ def test_segformer_attention( reference_model.load_state_dict(sd) reference_model.eval() + torch_input_tensor = torch.reshape(torch_input_tensor, (batch_size, seq_len, hidden_size)) output = reference_model(torch_input_tensor, height, width) parameters = preprocess_model_parameters( @@ -116,6 +117,4 @@ def test_segformer_attention( if len(ttnn_final_output.shape) == 4: ttnn_final_output = ttnn_final_output[0] - assert_with_pcc( - output[0], ttnn_final_output, pcc=0.85 - ) # 0.97 to 0.85 due to adding parameters to linear and layernorm + assert_with_pcc(output[0], ttnn_final_output, pcc=0.99) diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_decode_head.py b/tests/ttnn/integration_tests/segformer/test_segformer_decode_head.py index 6e898cdede7..61b0ee600f5 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_decode_head.py +++ 
b/tests/ttnn/integration_tests/segformer/test_segformer_decode_head.py @@ -3,13 +3,14 @@ # SPDX-License-Identifier: Apache-2.0 import torch +import math import pytest import ttnn from tests.ttnn.utils_for_testing import assert_with_pcc from transformers import SegformerForSemanticSegmentation from ttnn.model_preprocessing import fold_batch_norm2d_into_conv2d, preprocess_model_parameters -from models.experimental.functional_segformer.reference.segformer_decode_head import SegformerDecodeHead -from models.experimental.functional_segformer.tt.ttnn_segformer_decode_head import TtSegformerDecodeHead +from models.demos.segformer.reference.segformer_decode_head import SegformerDecodeHead +from models.demos.segformer.tt.ttnn_segformer_decode_head import TtSegformerDecodeHead from tests.ttnn.integration_tests.segformer.test_segformer_mlp import ( create_custom_preprocessor as create_custom_preprocessor_mlp, ) @@ -58,45 +59,39 @@ def test_segformer_decode_head(device, is_ci_env): torch_input_tensor_2 = torch.randn(1, 160, 32, 32) torch_input_tensor_3 = torch.randn(1, 256, 16, 16) - if 0: - torch_input_tensor_0_folded = torch_input_tensor_0 - torch_input_tensor_1_folded = torch_input_tensor_1 - torch_input_tensor_2_folded = torch_input_tensor_2 - torch_input_tensor_3_folded = torch_input_tensor_3 - else: - torch_input_tensor_0_folded = torch.reshape(torch_input_tensor_0, (batch_size, 32, 128 * 128)) - torch_input_tensor_0_folded = torch.permute(torch_input_tensor_0_folded, (0, 2, 1)) - torch_input_tensor_1_folded = torch.reshape(torch_input_tensor_1, (batch_size, 64, 64 * 64)) - torch_input_tensor_1_folded = torch.permute(torch_input_tensor_1_folded, (0, 2, 1)) - torch_input_tensor_2_folded = torch.reshape(torch_input_tensor_2, (batch_size, 160, 32 * 32)) - torch_input_tensor_2_folded = torch.permute(torch_input_tensor_2_folded, (0, 2, 1)) - torch_input_tensor_3_folded = torch.reshape(torch_input_tensor_3, (batch_size, 256, 16 * 16)) - torch_input_tensor_3_folded = torch.permute(torch_input_tensor_3_folded, (0, 2, 1)) + torch_input_tensor_0_folded = torch.reshape(torch_input_tensor_0, (batch_size, 32, 128 * 128)) + torch_input_tensor_0_folded = torch.permute(torch_input_tensor_0_folded, (0, 2, 1)) + torch_input_tensor_1_folded = torch.reshape(torch_input_tensor_1, (batch_size, 64, 64 * 64)) + torch_input_tensor_1_folded = torch.permute(torch_input_tensor_1_folded, (0, 2, 1)) + torch_input_tensor_2_folded = torch.reshape(torch_input_tensor_2, (batch_size, 160, 32 * 32)) + torch_input_tensor_2_folded = torch.permute(torch_input_tensor_2_folded, (0, 2, 1)) + torch_input_tensor_3_folded = torch.reshape(torch_input_tensor_3, (batch_size, 256, 16 * 16)) + torch_input_tensor_3_folded = torch.permute(torch_input_tensor_3_folded, (0, 2, 1)) ttnn_input_tensor_0 = ttnn.from_torch( torch_input_tensor_0_folded, - dtype=ttnn.bfloat16, + dtype=ttnn.bfloat8_b, memory_config=ttnn.L1_MEMORY_CONFIG, device=device, layout=ttnn.TILE_LAYOUT, ) ttnn_input_tensor_1 = ttnn.from_torch( torch_input_tensor_1_folded, - dtype=ttnn.bfloat16, + dtype=ttnn.bfloat8_b, memory_config=ttnn.L1_MEMORY_CONFIG, device=device, layout=ttnn.TILE_LAYOUT, ) ttnn_input_tensor_2 = ttnn.from_torch( torch_input_tensor_2_folded, - dtype=ttnn.bfloat16, + dtype=ttnn.bfloat8_b, memory_config=ttnn.L1_MEMORY_CONFIG, device=device, layout=ttnn.TILE_LAYOUT, ) ttnn_input_tensor_3 = ttnn.from_torch( torch_input_tensor_3_folded, - dtype=ttnn.bfloat16, + dtype=ttnn.bfloat8_b, memory_config=ttnn.L1_MEMORY_CONFIG, device=device, layout=ttnn.TILE_LAYOUT, @@ 
-138,13 +133,12 @@ def test_segformer_decode_head(device, is_ci_env): ttnn_model = TtSegformerDecodeHead(config, parameters) ttnn_output = ttnn_model(ttnn_input_tensor, parameters) + ttnn.deallocate(ttnn_input_tensor_0) - ttnn_output = ttnn.from_device(ttnn_output) ttnn_output = ttnn.to_torch(ttnn_output) + ttnn_output = torch.permute(ttnn_output, (0, 3, 1, 2)) + h = w = int(math.sqrt(ttnn_output.shape[-1])) + ttnn_output = torch.reshape(ttnn_output, (ttnn_output.shape[0], ttnn_output.shape[1], h, w)) - # torch_output = torch.permute(torch_output,(0,3,2,1)) - # torch_output = torch.reshape(torch_output,(batch_size, 1, 16384, 256)) - - # print("ddd", torch_output.shape, ttnn_output.shape) - + print(torch_output.shape, ttnn_output.shape) assert_with_pcc(torch_output, ttnn_output, pcc=0.99) diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_dwconv.py b/tests/ttnn/integration_tests/segformer/test_segformer_dwconv.py index 02de4d21d23..188ac4ecdfd 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_dwconv.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_dwconv.py @@ -8,9 +8,9 @@ from tests.ttnn.utils_for_testing import assert_with_pcc from transformers import SegformerModel -from models.experimental.functional_segformer.reference.segformer_dwconv import SegformerDWConv +from models.demos.segformer.reference.segformer_dwconv import SegformerDWConv import pytest -from models.experimental.functional_segformer.tt.ttnn_segformer_dwconv import TtSegformerDWConv +from models.demos.segformer.tt.ttnn_segformer_dwconv import TtSegformerDWConv from models.utility_functions import skip_for_grayskull diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_efficient_selfattention.py b/tests/ttnn/integration_tests/segformer/test_segformer_efficient_selfattention.py index 0b00c9a3e2b..ac7bd074624 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_efficient_selfattention.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_efficient_selfattention.py @@ -13,11 +13,11 @@ from transformers import SegformerModel import pytest from tests.ttnn.utils_for_testing import assert_with_pcc -from models.experimental.functional_segformer.tt.ttnn_segformer_efficient_selfattention import ( +from models.demos.segformer.tt.ttnn_segformer_efficient_selfattention import ( TtSegformerEfficientSelfAttention, ) from models.utility_functions import skip_for_grayskull -from models.experimental.functional_segformer.reference.segformer_efficient_selfattention import ( +from models.demos.segformer.reference.segformer_efficient_selfattention import ( SegformerEfficientSelfAttention, ) @@ -48,10 +48,10 @@ def custom_preprocessor(model, name, ttnn_module_args): parameters["layer_norm"] = {} parameters["layer_norm"]["weight"] = preprocess_layernorm_parameter( - model.layer_norm.weight, dtype=ttnn.bfloat16 + model.layer_norm.weight, dtype=ttnn.bfloat8_b ) parameters["layer_norm"]["bias"] = preprocess_layernorm_parameter( - model.layer_norm.bias, dtype=ttnn.bfloat16 + model.layer_norm.bias, dtype=ttnn.bfloat8_b ) return parameters @@ -64,13 +64,13 @@ def custom_preprocessor(model, name, ttnn_module_args): "batch_size, seq_len, hidden_size, height, width, num_attention_heads, sequence_reduction_ratio, block_i, efficient_self_attention_i", [ (1, 16384, 32, 128, 128, 1, 8, 0, 0), - # (1, 16384, 32, 128, 128, 1, 8, 0, 1), + (1, 16384, 32, 128, 128, 1, 8, 0, 1), (1, 4096, 64, 64, 64, 2, 4, 1, 0), - # (1, 4096, 64, 64, 64, 2, 4, 1, 1), + (1, 4096, 64, 64, 64, 2, 4, 1, 
1), (1, 1024, 160, 32, 32, 5, 2, 2, 0), - # (1, 1024, 160, 32, 32, 5, 2, 2, 1), + (1, 1024, 160, 32, 32, 5, 2, 2, 1), (1, 256, 256, 16, 16, 8, 1, 3, 0), - # (1, 256, 256, 16, 16, 8, 1, 3, 1), + (1, 256, 256, 16, 16, 8, 1, 3, 1), ], ) @pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True) @@ -91,7 +91,7 @@ def test_segformer_efficient_selfattention( if is_ci_env: pytest.skip("Skip in CI, model is WIP, issue# 13357") - torch_input_tensor = torch.randn(batch_size, seq_len, hidden_size) + torch_input_tensor = torch.randn(batch_size, 1, seq_len, hidden_size) ttnn_input_tensor = ttnn.from_torch( torch_input_tensor, dtype=ttnn.bfloat8_b, @@ -114,6 +114,7 @@ def test_segformer_efficient_selfattention( reference_model.load_state_dict(sd) reference_model.eval() + torch_input_tensor = torch.reshape(torch_input_tensor, (batch_size, seq_len, hidden_size)) torch_output = reference_model(torch_input_tensor, height, width) parameters = preprocess_model_parameters( @@ -133,6 +134,4 @@ def test_segformer_efficient_selfattention( ttnn_final_output = ttnn.to_torch(ttnn_output[0]) if len(ttnn_final_output.shape) == 4: ttnn_final_output = ttnn_final_output[0] - assert_with_pcc( - torch_output[0], ttnn_final_output, pcc=0.96 - ) # 0.98 to 0.96 due to adding parameters for linear and layernorm + assert_with_pcc(torch_output[0], ttnn_final_output, pcc=0.98) diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_encoder.py b/tests/ttnn/integration_tests/segformer/test_segformer_encoder.py index f4d1a2c5cba..bd315c59102 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_encoder.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_encoder.py @@ -21,10 +21,10 @@ from transformers import SegformerModel, SegformerConfig import pytest -from models.experimental.functional_segformer.tt.ttnn_segformer_encoder import ( +from models.demos.segformer.tt.ttnn_segformer_encoder import ( TtSegformerEncoder, ) -from models.experimental.functional_segformer.reference.segformer_encoder import SegformerEncoder +from models.demos.segformer.reference.segformer_encoder import SegformerEncoder def create_custom_preprocessor(device): @@ -52,10 +52,10 @@ def custom_preprocessor(model, name, ttnn_module_args): for i in range(4): parameters["layer_norm"][i] = {} parameters["layer_norm"][i]["weight"] = preprocess_layernorm_parameter( - model.layer_norm[i].weight, dtype=ttnn.bfloat16 + model.layer_norm[i].weight, dtype=ttnn.bfloat8_b ) parameters["layer_norm"][i]["bias"] = preprocess_layernorm_parameter( - model.layer_norm[i].bias, dtype=ttnn.bfloat16 + model.layer_norm[i].bias, dtype=ttnn.bfloat8_b ) return parameters @@ -93,13 +93,6 @@ def test_segformer_encoder(batch_size, num_channels, height, width, device, rese pytest.skip("Skip in CI, model is WIP, issue# 13357") torch_input_tensor = torch.randn(batch_size, num_channels, height, width) - ttnn_input_tensor = ttnn.from_torch( - torch_input_tensor, - dtype=ttnn.bfloat16, - memory_config=ttnn.L1_MEMORY_CONFIG, - device=device, - layout=ttnn.TILE_LAYOUT, - ) torch_model = SegformerModel.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") config = torch_model.config @@ -127,8 +120,47 @@ def test_segformer_encoder(batch_size, num_channels, height, width, device, rese ttnn_model = TtSegformerEncoder(config, parameters) + sharded_input_enabled = 0 + + if not sharded_input_enabled: + torch_input_tensor_permuted = torch.permute(torch_input_tensor, (0, 2, 3, 1)) + ttnn_input_tensor = ttnn.from_torch( + 
torch_input_tensor_permuted, + dtype=ttnn.bfloat16, + memory_config=ttnn.L1_MEMORY_CONFIG, + device=device, + layout=ttnn.TILE_LAYOUT, + ) + else: + torch_input_tensor = torch.permute(torch_input_tensor, (0, 2, 3, 1)) + torch_input_tensor = torch.nn.functional.pad(torch_input_tensor, (0, 13, 0, 0, 0, 0, 0, 0)) + N, H, W, C = torch_input_tensor.shape + torch_input_tensor = torch.reshape(torch_input_tensor, (N, 1, H * W, C)) + + shard_grid = ttnn.CoreRangeSet( + { + ttnn.CoreRange( + ttnn.CoreCoord(0, 0), + ttnn.CoreCoord(7, 7), + ), + } + ) + n_cores = 64 + shard_spec = ttnn.ShardSpec(shard_grid, [N * H * W // n_cores, C], ttnn.ShardOrientation.ROW_MAJOR, False) + input_mem_config = ttnn.MemoryConfig( + ttnn.types.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.types.BufferType.L1, shard_spec + ) + ttnn_input_tensor = ttnn.from_torch( + torch_input_tensor, + dtype=ttnn.bfloat16, + layout=ttnn.ROW_MAJOR_LAYOUT, + device=device, + memory_config=input_mem_config, + ) + ttnn_output = ttnn_model(ttnn_input_tensor, parameters=parameters) ttnn_final_output = ttnn.to_torch(ttnn_output.last_hidden_state) + torch_final_output = torch.permute(torch_output.last_hidden_state, (0, 2, 3, 1)) - assert_with_pcc(torch_output.last_hidden_state, ttnn_final_output, pcc=0.885) + assert_with_pcc(torch_final_output, ttnn_final_output, pcc=0.929) diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_for_semantic_segmentation.py b/tests/ttnn/integration_tests/segformer/test_segformer_for_semantic_segmentation.py index 849ed588e11..34b1b7d7ea8 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_for_semantic_segmentation.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_for_semantic_segmentation.py @@ -5,16 +5,18 @@ import pytest import ttnn from PIL import Image +import torch +import math import requests from tests.ttnn.utils_for_testing import assert_with_pcc from ttnn.model_preprocessing import preprocess_model_parameters, ParameterDict, ParameterList -from models.experimental.functional_segformer.tt.ttnn_segformer_for_semantic_segmentation import ( +from models.demos.segformer.tt.ttnn_segformer_for_semantic_segmentation import ( TtSegformerForSemanticSegmentation, ) from datasets import load_dataset from transformers import SegformerForSemanticSegmentation, SegformerImageProcessor -from models.experimental.functional_segformer.reference.segformer_for_semantic_segmentation import ( +from models.demos.segformer.reference.segformer_for_semantic_segmentation import ( SegformerForSemanticSegmentationReference, ) from tests.ttnn.integration_tests.segformer.test_segformer_model import ( @@ -78,14 +80,6 @@ def test_segformer_for_semantic_segmentation(device, is_ci_env): state_dict = torch_model.state_dict() inputs = processor(images=image, return_tensors="pt") - ttnn_input_tensor = ttnn.from_torch( - inputs.pixel_values, - dtype=ttnn.bfloat16, - memory_config=ttnn.L1_MEMORY_CONFIG, - device=device, - layout=ttnn.TILE_LAYOUT, - ) - new_state_dict = {} keys = [name for name, parameter in reference_model.state_dict().items()] values = [parameter for name, parameter in state_dict.items()] @@ -112,6 +106,15 @@ def test_segformer_for_semantic_segmentation(device, is_ci_env): ttnn_model = TtSegformerForSemanticSegmentation(config, parameters) + torch_input_tensor_permuted = torch.permute(inputs.pixel_values, (0, 2, 3, 1)) + ttnn_input_tensor = ttnn.from_torch( + torch_input_tensor_permuted, + dtype=ttnn.bfloat16, + memory_config=ttnn.L1_MEMORY_CONFIG, + device=device, + layout=ttnn.TILE_LAYOUT, + 
) + ttnn_output = ttnn_model( ttnn_input_tensor, output_attentions=None, @@ -119,6 +122,10 @@ def test_segformer_for_semantic_segmentation(device, is_ci_env): return_dict=None, parameters=parameters, ) - ttnn_final_output = ttnn.to_torch(ttnn_output.logits) - assert_with_pcc(torch_output.logits, ttnn_final_output, pcc=0.99) + ttnn_output = ttnn.to_torch(ttnn_output.logits) + ttnn_output = torch.permute(ttnn_output, (0, 3, 1, 2)) + h = w = int(math.sqrt(ttnn_output.shape[-1])) + ttnn_final_output = torch.reshape(ttnn_output, (ttnn_output.shape[0], ttnn_output.shape[1], h, w)) + + assert_with_pcc(torch_output.logits, ttnn_final_output, pcc=0.985) diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_layer.py b/tests/ttnn/integration_tests/segformer/test_segformer_layer.py index 1377cb413a3..a685a77e59d 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_layer.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_layer.py @@ -14,11 +14,11 @@ from transformers import SegformerModel import pytest -from models.experimental.functional_segformer.tt.ttnn_segformer_layer import ( +from models.demos.segformer.tt.ttnn_segformer_layer import ( TtSegformerLayer, ) -from models.experimental.functional_segformer.reference.segformer_layer import SegformerLayer +from models.demos.segformer.reference.segformer_layer import SegformerLayer from tests.ttnn.integration_tests.segformer.test_segformer_mix_ffn import ( create_custom_preprocessor as create_custom_preprocessor_mix_ffn, ) @@ -34,10 +34,10 @@ def custom_preprocessor(model, name, ttnn_module_args): if isinstance(model, SegformerLayer): parameters["layer_norm_1"] = {} parameters["layer_norm_1"]["weight"] = preprocess_layernorm_parameter( - model.layer_norm_1.weight, dtype=ttnn.bfloat16 + model.layer_norm_1.weight, dtype=ttnn.bfloat8_b ) parameters["layer_norm_1"]["bias"] = preprocess_layernorm_parameter( - model.layer_norm_1.bias, dtype=ttnn.bfloat16 + model.layer_norm_1.bias, dtype=ttnn.bfloat8_b ) attention_preprocess = create_custom_preprocessor_attention(device) @@ -50,10 +50,10 @@ def custom_preprocessor(model, name, ttnn_module_args): parameters["layer_norm_2"] = {} parameters["layer_norm_2"]["weight"] = preprocess_layernorm_parameter( - model.layer_norm_2.weight, dtype=ttnn.bfloat16 + model.layer_norm_2.weight, dtype=ttnn.bfloat8_b ) parameters["layer_norm_2"]["bias"] = preprocess_layernorm_parameter( - model.layer_norm_2.bias, dtype=ttnn.bfloat16 + model.layer_norm_2.bias, dtype=ttnn.bfloat8_b ) return parameters @@ -111,7 +111,7 @@ def test_segformer_layer( if is_ci_env: pytest.skip("Skip in CI, model is WIP, issue# 13357") - torch_input_tensor = torch.randn(batch_size, seq_len, hidden_size) + torch_input_tensor = torch.randn(batch_size, 1, seq_len, hidden_size) ttnn_input_tensor = ttnn.from_torch( torch_input_tensor, dtype=ttnn.bfloat16, @@ -136,6 +136,7 @@ def test_segformer_layer( reference_model.load_state_dict(sd) reference_model.eval() + torch_input_tensor = torch.reshape(torch_input_tensor, (batch_size, seq_len, hidden_size)) torch_output = reference_model(torch_input_tensor, height=height, width=width) parameters = preprocess_model_parameters( @@ -157,4 +158,4 @@ def test_segformer_layer( if len(ttnn_final_output.shape) == 4: ttnn_final_output = ttnn_final_output[0] - assert_with_pcc(torch_output[0], ttnn_final_output, pcc=0.94) + assert_with_pcc(torch_output[0], ttnn_final_output, pcc=0.99) diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_mix_ffn.py 
b/tests/ttnn/integration_tests/segformer/test_segformer_mix_ffn.py index 80b03d7a736..ac2fbd79692 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_mix_ffn.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_mix_ffn.py @@ -13,8 +13,8 @@ from transformers import SegformerModel import pytest -from models.experimental.functional_segformer.tt.ttnn_segformer_mix_ffn import TtSegformerMixFFN -from models.experimental.functional_segformer.reference.segformer_mixffn import SegformerMixFFN +from models.demos.segformer.tt.ttnn_segformer_mix_ffn import TtSegformerMixFFN +from models.demos.segformer.reference.segformer_mixffn import SegformerMixFFN from tests.ttnn.integration_tests.segformer.test_segformer_dwconv import ( create_custom_preprocessor as create_custom_preprocessor_dwconv, ) @@ -111,4 +111,4 @@ def test_segformer_mix_ffn( ttnn_output = ttnn.from_device(ttnn_output) ttnn_output = ttnn.to_torch(ttnn_output)[0] - assert_with_pcc(torch_output, ttnn_output, pcc=0.96) + assert_with_pcc(torch_output, ttnn_output, pcc=0.99) diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_mlp.py b/tests/ttnn/integration_tests/segformer/test_segformer_mlp.py index 80ad040fb03..ae9101601f3 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_mlp.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_mlp.py @@ -7,11 +7,11 @@ import torch from tests.ttnn.utils_for_testing import assert_with_pcc from ttnn.model_preprocessing import preprocess_model_parameters, preprocess_linear_weight, preprocess_linear_bias -from models.experimental.functional_segformer.tt.ttnn_segformer_mlp import ( +from models.demos.segformer.tt.ttnn_segformer_mlp import ( TtSegformerMLP, ) -from models.experimental.functional_segformer.reference.segformer_mlp import ( +from models.demos.segformer.reference.segformer_mlp import ( SegformerMLP, ) from transformers import SegformerForSemanticSegmentation diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_model.py b/tests/ttnn/integration_tests/segformer/test_segformer_model.py index bccf5dfd7cf..c10e2ac8ba8 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_model.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_model.py @@ -12,10 +12,10 @@ from transformers import SegformerModel import pytest -from models.experimental.functional_segformer.tt.ttnn_segformer_model import ( +from models.demos.segformer.tt.ttnn_segformer_model import ( TtSegformerModel, ) -from models.experimental.functional_segformer.reference.segformer_model import SegformerModelReference +from models.demos.segformer.reference.segformer_model import SegformerModelReference from models.utility_functions import skip_for_grayskull @@ -70,13 +70,7 @@ def test_segformer_model( pytest.skip("Skip in CI, model is WIP, issue# 13357") torch_input_tensor = torch.randn(batch_size, num_channels, height, width) - ttnn_input_tensor = ttnn.from_torch( - torch_input_tensor, - dtype=ttnn.bfloat16, - memory_config=ttnn.L1_MEMORY_CONFIG, - device=device, - layout=ttnn.TILE_LAYOUT, - ) + torch_model = SegformerModel.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") config = torch_model.config @@ -102,6 +96,15 @@ def test_segformer_model( ttnn_model = TtSegformerModel(config, parameters) + torch_input_tensor_permuted = torch.permute(torch_input_tensor, (0, 2, 3, 1)) + ttnn_input_tensor = ttnn.from_torch( + torch_input_tensor_permuted, + dtype=ttnn.bfloat16, + memory_config=ttnn.L1_MEMORY_CONFIG, + device=device, + 
layout=ttnn.TILE_LAYOUT, + ) + ttnn_output = ttnn_model( ttnn_input_tensor, output_attentions=None, @@ -110,5 +113,6 @@ def test_segformer_model( parameters=parameters, ) ttnn_final_output = ttnn.to_torch(ttnn_output[0]) + torch_final_output = torch.permute(torch_output.last_hidden_state, (0, 2, 3, 1)) - assert_with_pcc(torch_output[0], ttnn_final_output, pcc=0.885) + assert_with_pcc(torch_final_output, ttnn_final_output, pcc=0.929) diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_overlap_path_embeddings.py b/tests/ttnn/integration_tests/segformer/test_segformer_overlap_path_embeddings.py index fd080e1f8d7..6f6d22ca9d2 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_overlap_path_embeddings.py +++ b/tests/ttnn/integration_tests/segformer/test_segformer_overlap_path_embeddings.py @@ -11,11 +11,11 @@ from tests.ttnn.utils_for_testing import assert_with_pcc from transformers import SegformerModel import pytest -from models.experimental.functional_segformer.tt.ttnn_segformer_overlap_patch_embeddings import ( +from models.demos.segformer.tt.ttnn_segformer_overlap_patch_embeddings import ( TtSegformerOverlapPatchEmbeddings, ) -from models.experimental.functional_segformer.reference.segformer_overlap_patch_embeddings import ( +from models.demos.segformer.reference.segformer_overlap_patch_embeddings import ( SegformerOverlapPatchEmbeddings, ) from models.utility_functions import skip_for_grayskull @@ -34,10 +34,10 @@ def custom_preprocessor(model, name, ttnn_module_args): parameters["layer_norm"] = {} parameters["layer_norm"]["weight"] = preprocess_layernorm_parameter( - model.layer_norm.weight, dtype=ttnn.bfloat16 + model.layer_norm.weight, dtype=ttnn.bfloat8_b ) parameters["layer_norm"]["bias"] = preprocess_layernorm_parameter( - model.layer_norm.bias, dtype=ttnn.bfloat16 + model.layer_norm.bias, dtype=ttnn.bfloat8_b ) return parameters @@ -73,22 +73,6 @@ def test_segformer_overlap_patch_embeddings( pytest.skip("Skip in CI, model is WIP, issue# 13357") torch_input_tensor = torch.randn(batch_size, num_channels, height, width) - if width == 512: - ttnn_input_tensor = ttnn.from_torch( - torch_input_tensor, - dtype=ttnn.bfloat16, - memory_config=ttnn.L1_MEMORY_CONFIG, - device=device, - layout=ttnn.TILE_LAYOUT, - ) - else: - ttnn_input_tensor = ttnn.from_torch( - torch_input_tensor, - dtype=ttnn.bfloat8_b, - memory_config=ttnn.L1_MEMORY_CONFIG, - device=device, - layout=ttnn.TILE_LAYOUT, - ) torch_model = SegformerModel.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") @@ -114,6 +98,27 @@ def test_segformer_overlap_patch_embeddings( stride=stride, ) + post_process_it = 0 + if width == 512: + torch_input_tensor = torch.permute(torch_input_tensor, (0, 2, 3, 1)) + ttnn_input_tensor = ttnn.from_torch( + torch_input_tensor, + dtype=ttnn.bfloat16, + memory_config=ttnn.L1_MEMORY_CONFIG, + device=device, + layout=ttnn.TILE_LAYOUT, + ) + else: + torch_input_tensor = torch.permute(torch_input_tensor, (0, 2, 3, 1)) + ttnn_input_tensor = ttnn.from_torch( + torch_input_tensor, + dtype=ttnn.bfloat16, + memory_config=ttnn.L1_MEMORY_CONFIG, + device=device, + layout=ttnn.ROW_MAJOR_LAYOUT, + ) + post_process_it = 1 + ttnn_output, height, width = ttnn_model( ttnn_input_tensor, parameters=parameters, diff --git a/tests/ttnn/integration_tests/segformer/test_segformer_selfoutput.py b/tests/ttnn/integration_tests/segformer/test_segformer_selfoutput.py index 525795c8c0f..bb0e766fcc2 100644 --- a/tests/ttnn/integration_tests/segformer/test_segformer_selfoutput.py +++ 
b/tests/ttnn/integration_tests/segformer/test_segformer_selfoutput.py @@ -12,8 +12,8 @@ from tests.ttnn.utils_for_testing import assert_with_pcc from transformers import SegformerModel import pytest -from models.experimental.functional_segformer.tt.ttnn_segformer_selfoutput import TtSegformerSelfOutput -from models.experimental.functional_segformer.reference.segformer_selfoutput import SegformerSelfOutput +from models.demos.segformer.tt.ttnn_segformer_selfoutput import TtSegformerSelfOutput +from models.demos.segformer.reference.segformer_selfoutput import SegformerSelfOutput from models.utility_functions import skip_for_grayskull diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py index 00be4435617..31f4636aa66 100644 --- a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py +++ b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py @@ -12,7 +12,7 @@ def perf_report(file_path): df = df.dropna(subset=["DEVICE ERISC KERNEL DURATION [ns]"]) df = df[df["OP TO OP LATENCY [ns]"] != 0] - df = df[df["TRACE ID"].notna() & (df["TRACE ID"] != "")] + df = df[df["METAL TRACE ID"].notna() & (df["METAL TRACE ID"] != "")] def remove_keys_from_attributes(attributes): attributes = attributes.replace(";", ",").replace("'", '"') @@ -56,7 +56,9 @@ def safe_parse_attributes(attributes): ) df["dim"] = df["ATTRIBUTES"].apply( - lambda x: safe_parse_attributes(x).get("dim", "") if isinstance(safe_parse_attributes(x), dict) else "" + lambda x: safe_parse_attributes(x).get("dim", safe_parse_attributes(x).get("scatter_dim", "")) + if isinstance(safe_parse_attributes(x), dict) + else "" ) df["num_links"] = df["ATTRIBUTES"].apply( @@ -154,15 +156,15 @@ def calculate_bandwidth(row): op_bw = (output_tensor_volume * (n_chips - 1) / n_chips) / longest_device_fw_time link_bw = (output_tensor_volume * (n_chips - 1) / n_chips) / longest_erisc_fw_time elif row["OP CODE"] == "ReduceScatter": - op_bw = (input_tensor_volume / n_chips) / longest_device_fw_time - link_bw = (input_tensor_volume * (n_chips - 1) / n_chips) / longest_erisc_fw_time + op_bw = input_tensor_volume / longest_device_fw_time + link_bw = input_tensor_volume / longest_erisc_fw_time elif row["topology"] == "Linear": if row["OP CODE"] == "AllGather": op_bw = input_tensor_volume * n_chips / longest_device_fw_time link_bw = input_tensor_volume * (n_chips - 1) / longest_erisc_fw_time elif row["OP CODE"] == "ReduceScatter": op_bw = input_tensor_volume / longest_device_fw_time - link_bw = input_tensor_volume * (n_chips - 1) / n_chips / longest_erisc_fw_time + link_bw = input_tensor_volume / longest_erisc_fw_time return round(op_bw, 2), round(link_bw, 2) for i, (group, group_df) in enumerate(grouped, start=1): @@ -194,13 +196,17 @@ def calculate_bandwidth(row): "output_mem_config": group_df["output_mem_config"].iloc[0] if "output_mem_config" in group_df else "", "topology": group_df["topology"].iloc[0], "Layout": group_df["Layout"].iloc[0] if "Layout" in group_df else "", + "Data Type": group_df["Data Type"].iloc[0] if "Data Type" in group_df else "", } for column in numeric_columns: min_val = round(group_df[column].min(), 2) largest_vals = group_df[column].nlargest(3) max_val = round(largest_vals.iloc[-1], 2) - avg_val = round(group_df[column][~group_df[column].isin(largest_vals.head(2))].mean(), 2) + if min_val == max_val: + avg_val = min_val + else: + avg_val = round(group_df[column][~group_df[column].isin(largest_vals.head(2))].mean(), 2) group_data[column] = f"{min_val} - {avg_val} - 
{max_val}" diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh similarity index 100% rename from tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh rename to tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh new file mode 100755 index 00000000000..23071225ac1 --- /dev/null +++ b/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh @@ -0,0 +1,97 @@ +#!/bin/sh +MODULE_DIR="tests/ttnn/unit_tests/operations/ccl/perf" + +# Defaults +DEBUG=false +TARGET="n300" + +# Function to display help +show_help() { + echo "Usage: ./tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh [OPTIONS]" + echo + echo "Options:" + echo " -d, --debug Enable debug mode to show real-time output." + echo " -t, --target Specify the target configuration (t3000 or n300). Default is n300." + echo " -h, --help Display this help message." + echo + echo "Example:" + echo " ./tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh --debug --target n300" + echo " ./tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh -h" +} + +# Parse command-line arguments +while [ $# -gt 0 ]; do + case "$1" in + --debug|-d) + DEBUG=true + shift + ;; + --help|-h) + show_help + exit 0 + ;; + --target|-t) + # Ensure there is an argument following the target flag + if [ -z "$2" ]; then + echo "Error: No target specified after $1." + show_help + exit 1 + fi + + TARGET="$2" # Set the target configuration + shift 2 + + # Validate the target value + if [ "$TARGET" != "t3000" ] && [ "$TARGET" != "n300" ]; then + echo "Error: Invalid target configuration: $TARGET. Must be either 't3000' or 'n300'." + exit 1 + fi + ;; + *) + echo "Unknown option: $1" + show_help + exit 1 + ;; + esac +done + +# Function to run the profiling command and extract the CSV path +run_profile_and_extract_csv() { + command="./tt_metal/tools/profiler/profile_this.py -n reduce_scatter_$TARGET -c 'pytest tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py::test_reduce_scatter_on_$TARGET'" + + if [ "$DEBUG" = true ]; then + echo "Running profiling command for target $TARGET in debug mode..." + full_output=$(eval $command 2>&1 | tee /dev/tty) + else + echo "Running profiling command for target $TARGET..." + full_output=$(eval $command 2>&1) + fi + + # Extract the CSV path + csv_path=$(echo "$full_output" | grep -oE 'OPs csv generated at: (.+\.csv)' | sed -E 's/OPs csv generated at: //') + + if [ -n "$csv_path" ]; then + echo "CSV path found: $csv_path" + + # Run the Python script to generate performance report + average_values=$(PYTHONPATH="$MODULE_DIR" python3 -c " +import pandas as pd +from perf_csv import perf_report +from tabulate import tabulate + +# Generate the report and convert it to a DataFrame +average_df = perf_report('$csv_path') +# Print the DataFrame in a pretty table format +print(tabulate(average_df, headers='keys', tablefmt='pretty')) +") + + # Print the output + echo "Min - Avg - Max by Common Runs:" + echo "$average_values" + else + echo "CSV path not found in the command output." 
+ fi +} + +# Run the function +run_profile_and_extract_csv diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py index c9a6c90ef31..1429eb0fce1 100644 --- a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py +++ b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py @@ -9,6 +9,9 @@ run_all_gather_on_n300_impl, run_all_gather_on_t3000_impl_tight_loop, ) +from tests.ttnn.unit_tests.operations.ccl.test_reduce_scatter_post_commit import ( + run_reduce_scatter_test, +) @skip_for_grayskull("Requires eth connected devices to run") @@ -128,3 +131,138 @@ def test_all_gather_on_t3000( enable_async=enable_async, trace_mode=True, ) + + +@skip_for_grayskull("Requires eth connected devices to run") +@pytest.mark.parametrize( + "num_devices, num_links", + [ + (8, 1), + ], +) +@pytest.mark.parametrize( + "per_chip_output_shape, scatter_dim, layout", + [ + ([1, 8, 1024, 1024], 3, ttnn.TILE_LAYOUT), + ([1, 4, 1024, 1024], 3, ttnn.TILE_LAYOUT), + ([1, 4, 2048, 1024], 3, ttnn.TILE_LAYOUT), + ([1, 1, 32, 32 * 8], 3, ttnn.TILE_LAYOUT), + ([1, 1, 32, 64 * 8], 3, ttnn.TILE_LAYOUT), + ], +) +@pytest.mark.parametrize( + "input_dtype", + [ + ttnn.bfloat16, + ], +) +@pytest.mark.parametrize( + "mem_config", + [ + ttnn.MemoryConfig(buffer_type=ttnn.BufferType.DRAM), + ], +) +@pytest.mark.parametrize("num_iters", [20]) +@pytest.mark.parametrize("math_op", [ttnn.ReduceType.Sum]) +@pytest.mark.parametrize("enable_async", [True]) +@pytest.mark.parametrize("topology", [ttnn.Topology.Linear, ttnn.Topology.Ring]) +@pytest.mark.parametrize("device_params", [{"trace_region_size": 266240}], indirect=True) +def test_reduce_scatter_on_t3000( + t3k_mesh_device, + num_devices, + per_chip_output_shape, + scatter_dim, + num_links, + math_op, + input_dtype, + layout, + mem_config, + use_program_cache, + function_level_defaults, + enable_async, + num_iters, + topology, +): + run_reduce_scatter_test( + t3k_mesh_device, + num_devices, + per_chip_output_shape, + scatter_dim, + num_links, + math_op, + input_dtype, + layout, + mem_config, + use_program_cache, + function_level_defaults, + num_iters=num_iters, + enable_async=enable_async, + topology=topology, + trace_mode=True, + ) + + +@skip_for_grayskull("Requires eth connected devices to run") +@pytest.mark.parametrize( + "num_devices, num_links", + [ + (2, 1), + ], +) +@pytest.mark.parametrize( + "per_chip_output_shape, scatter_dim, layout", + [ + ([1, 1, 32, 4096], 3, ttnn.TILE_LAYOUT), + ([1, 1, 32, 2048], 3, ttnn.TILE_LAYOUT), + ([1, 1, 32, 1024], 3, ttnn.TILE_LAYOUT), + ], +) +@pytest.mark.parametrize( + "input_dtype", + [ + ttnn.bfloat16, + ttnn.bfloat8_b, + ], +) +@pytest.mark.parametrize( + "mem_config", + [ + ttnn.MemoryConfig(buffer_type=ttnn.BufferType.DRAM), + ttnn.MemoryConfig(buffer_type=ttnn.BufferType.L1), + ], +) +@pytest.mark.parametrize("num_iters", [20]) +@pytest.mark.parametrize("math_op", [ttnn.ReduceType.Sum]) +@pytest.mark.parametrize("enable_async", [True]) +@pytest.mark.parametrize("device_params", [{"trace_region_size": 266240}], indirect=True) +def test_reduce_scatter_on_n300( + n300_mesh_device, + num_devices, + per_chip_output_shape, + scatter_dim, + num_links, + math_op, + input_dtype, + layout, + mem_config, + use_program_cache, + function_level_defaults, + enable_async, + num_iters, +): + run_reduce_scatter_test( + n300_mesh_device, + num_devices, + per_chip_output_shape, + scatter_dim, + num_links, + math_op, + input_dtype, + layout, + mem_config, + 
use_program_cache, + function_level_defaults, + num_iters=num_iters, + enable_async=enable_async, + trace_mode=True, + ) diff --git a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py index 9fbc710ed7c..916682dd84e 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py @@ -32,9 +32,10 @@ def run_with_trace( num_links, math_op, output_mem_config, - n_worker, - n_buffer, - num_iters, + n_worker=None, + n_buffer=None, + num_iters=40, + topology=ttnn.Topology.Ring, ): # Compile Run logger.info("Compiling model") @@ -46,6 +47,7 @@ def run_with_trace( memory_config=output_mem_config, num_workers=n_worker, num_buffers_per_channel=n_buffer, + topology=topology, ) for device_id in t3k_mesh_device.get_device_ids(): ttnn.synchronize_device(t3k_mesh_device.get_device(device_id)) @@ -62,6 +64,7 @@ def run_with_trace( memory_config=output_mem_config, num_workers=n_worker, num_buffers_per_channel=n_buffer, + topology=topology, ) ttnn.end_trace_capture(t3k_mesh_device, trace_id, cq_id=0) for device_id in t3k_mesh_device.get_device_ids(): @@ -92,6 +95,7 @@ def run_reduce_scatter_test( enable_async=True, num_iters=1, topology=ttnn.Topology.Ring, + trace_mode=False, ): if len(mesh_device.get_device_ids()) < num_devices: pytest.skip( @@ -135,19 +139,31 @@ def run_reduce_scatter_test( input_tensor_mesh = ttnn.aggregate_as_tensor(tt_input_tensors) # Run the op - for i in range(num_iters): - output_tensor_mesh = ttnn.reduce_scatter( + if trace_mode: + output_tensor_mesh = run_with_trace( + mesh_device, input_tensor_mesh, - scatter_dim=scatter_dim, - math_op=math_op, - num_links=num_links, - memory_config=mem_config, + scatter_dim, + num_links, + math_op, + mem_config, + num_iters=num_iters, topology=topology, ) + else: + for i in range(num_iters): + output_tensor_mesh = ttnn.reduce_scatter( + input_tensor_mesh, + scatter_dim=scatter_dim, + math_op=math_op, + num_links=num_links, + memory_config=mem_config, + topology=topology, + ) - for device_id in mesh_device.get_device_ids(): - ttnn.synchronize_device(mesh_device.get_device(device_id)) - logger.info(f"Done iteration {i}") + for device_id in mesh_device.get_device_ids(): + ttnn.synchronize_device(mesh_device.get_device(device_id)) + logger.info(f"Done iteration {i}") # ttnn.visualize_mesh_device(t3k_mesh_device, tensor=output_tensor_mesh) # Compute golden diff --git a/tt_metal/hw/inc/blackhole/eth_interface.h b/tt_metal/hw/inc/blackhole/eth_interface.h deleted file mode 100644 index 676ba847483..00000000000 --- a/tt_metal/hw/inc/blackhole/eth_interface.h +++ /dev/null @@ -1,7 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "tt_metal/third_party/umd/src/firmware/riscv/blackhole/eth_interface.h" diff --git a/tt_metal/hw/inc/grayskull/eth_interface.h b/tt_metal/hw/inc/grayskull/eth_interface.h deleted file mode 100644 index 1f965f087db..00000000000 --- a/tt_metal/hw/inc/grayskull/eth_interface.h +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -const uint32_t ETH_RACK_COORD_WIDTH = 0; -const uint32_t CMD_BUF_SIZE_MASK = 0; -const uint32_t MAX_BLOCK_SIZE = 0; -const uint32_t REQUEST_CMD_QUEUE_BASE = 0; -const uint32_t RESPONSE_CMD_QUEUE_BASE = 0; -const uint32_t CMD_COUNTERS_SIZE_BYTES = 0; -const uint32_t REMOTE_UPDATE_PTR_SIZE_BYTES = 0; -const uint32_t CMD_DATA_BLOCK = 0; -const uint32_t CMD_WR_REQ = 0; -const uint32_t CMD_WR_ACK = 0; -const uint32_t CMD_RD_REQ = 0; -const uint32_t CMD_RD_DATA = 0; -const uint32_t CMD_BUF_SIZE = 0; -const uint32_t CMD_DATA_BLOCK_DRAM = 0; -const uint32_t ETH_ROUTING_DATA_BUFFER_ADDR = 0; -const uint32_t REQUEST_ROUTING_CMD_QUEUE_BASE = 0; -const uint32_t RESPONSE_ROUTING_CMD_QUEUE_BASE = 0; -const uint32_t CMD_BUF_PTR_MASK = 0; diff --git a/tt_metal/hw/inc/wormhole/eth_interface.h b/tt_metal/hw/inc/wormhole/eth_interface.h deleted file mode 100644 index 53285a9529e..00000000000 --- a/tt_metal/hw/inc/wormhole/eth_interface.h +++ /dev/null @@ -1,7 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "tt_metal/third_party/umd/src/firmware/riscv/wormhole/eth_interface.h" diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 8098a0cc5f1..17cacd5e51a 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -2378,9 +2378,17 @@ void HWCommandQueue::enqueue_trace(const uint32_t trace_id, bool blocking) { // Increment the expected worker cores counter due to trace programs completion this->expected_num_workers_completed += trace_inst->desc->num_completion_worker_cores; // After trace runs, the rdptr on each worker will be incremented by the number of programs in the trace - // Update the wptr on host to match state - this->device->worker_launch_message_buffer_state.set_mcast_wptr(trace_inst->desc->num_traced_programs_needing_go_signal_multicast); - this->device->worker_launch_message_buffer_state.set_unicast_wptr(trace_inst->desc->num_traced_programs_needing_go_signal_unicast); + // Update the wptr on host to match state. If the trace doesn't execute on a + // class of worker (unicast or multicast), it doesn't reset or modify the + // state for those workers. + if (trace_inst->desc->num_traced_programs_needing_go_signal_multicast) { + this->device->worker_launch_message_buffer_state.set_mcast_wptr( + trace_inst->desc->num_traced_programs_needing_go_signal_multicast); + } + if (trace_inst->desc->num_traced_programs_needing_go_signal_unicast) { + this->device->worker_launch_message_buffer_state.set_unicast_wptr( + trace_inst->desc->num_traced_programs_needing_go_signal_unicast); + } // The config buffer manager is unaware of what memory is used inside the trace, so mark all memory as used so that // it will force a stall and avoid stomping on in-use state. // TODO(jbauman): Reuse old state from the trace. 
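A brief illustration of the guarded wptr update above (hypothetical names; the real set_mcast_wptr/set_unicast_wptr semantics are simplified): if a trace contains no programs for one worker class, writing that class's write pointer anyway would clobber state left by earlier non-trace programs, so the update is applied only for worker classes the trace actually touched.

// Sketch only: mirrors the conditional-update logic above with hypothetical types.
#include <cstdint>

struct WorkerLaunchMsgBufferState {
    uint32_t mcast_wptr = 0;    // host-side write pointer for multicast workers
    uint32_t unicast_wptr = 0;  // host-side write pointer for unicast (eth) workers
};

// Apply the post-trace pointer update only for worker classes the trace used,
// so a trace with zero programs of a class leaves that class's state alone.
inline void update_wptrs_after_trace(WorkerLaunchMsgBufferState& state,
                                     uint32_t traced_mcast_programs,
                                     uint32_t traced_unicast_programs) {
    if (traced_mcast_programs != 0) {
        state.mcast_wptr = traced_mcast_programs;      // simplified stand-in for set_mcast_wptr
    }
    if (traced_unicast_programs != 0) {
        state.unicast_wptr = traced_unicast_programs;  // simplified stand-in for set_unicast_wptr
    }
}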
diff --git a/tt_metal/impl/dispatch/command_queue_interface.hpp b/tt_metal/impl/dispatch/command_queue_interface.hpp index 05a19ea4d47..04ff630f1f3 100644 --- a/tt_metal/impl/dispatch/command_queue_interface.hpp +++ b/tt_metal/impl/dispatch/command_queue_interface.hpp @@ -170,11 +170,11 @@ struct dispatch_constants { if (dev_addr_type == CommandQueueDeviceAddrType::PREFETCH_Q_RD) { device_cq_addr_sizes_[dev_addr_idx] = sizeof(uint32_t); } else if (dev_addr_type == CommandQueueDeviceAddrType::PREFETCH_Q_PCIE_RD) { - device_cq_addr_sizes_[dev_addr_idx] = L1_ALIGNMENT - sizeof(uint32_t); + device_cq_addr_sizes_[dev_addr_idx] = l1_alignment - sizeof(uint32_t); } else if (dev_addr_type == CommandQueueDeviceAddrType::DISPATCH_MESSAGE) { - device_cq_addr_sizes_[dev_addr_idx] = 32; // Should this be 2x L1_ALIGNMENT? + device_cq_addr_sizes_[dev_addr_idx] = 32; // Should this be 2x l1_alignment? } else { - device_cq_addr_sizes_[dev_addr_idx] = L1_ALIGNMENT; + device_cq_addr_sizes_[dev_addr_idx] = l1_alignment; } } diff --git a/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp b/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp index 763e6567b01..7e17ccfaca0 100644 --- a/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp +++ b/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp @@ -6,6 +6,8 @@ #define COMPILE_FOR_IDLE_ERISC +#include + #include "llrt/hal.hpp" #include "llrt/blackhole/bh_hal.hpp" #include "hw/inc/blackhole/core_config.h" @@ -29,7 +31,7 @@ HalCoreInfoType create_active_eth_mem_map() { std::vector mem_map_bases; mem_map_bases.resize(utils::underlying_type(HalL1MemAddrType::COUNT)); - mem_map_bases[utils::underlying_type(HalL1MemAddrType::BARRIER)] = MEM_L1_BARRIER; + mem_map_bases[utils::underlying_type(HalL1MemAddrType::BARRIER)] = eth_l1_mem::address_map::ERISC_BARRIER_BASE; mem_map_bases[utils::underlying_type(HalL1MemAddrType::MAILBOX)] = eth_l1_mem::address_map::ERISC_MEM_MAILBOX_BASE; mem_map_bases[utils::underlying_type(HalL1MemAddrType::LAUNCH)] = GET_ETH_MAILBOX_ADDRESS_HOST(launch); mem_map_bases[utils::underlying_type(HalL1MemAddrType::WATCHER)] = GET_ETH_MAILBOX_ADDRESS_HOST(watcher); @@ -40,10 +42,11 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_bases[utils::underlying_type(HalL1MemAddrType::CORE_INFO)] = GET_ETH_MAILBOX_ADDRESS_HOST(core_info); mem_map_bases[utils::underlying_type(HalL1MemAddrType::GO_MSG)] = GET_ETH_MAILBOX_ADDRESS_HOST(go_message); mem_map_bases[utils::underlying_type(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = GET_ETH_MAILBOX_ADDRESS_HOST(launch_msg_rd_ptr); + mem_map_bases[utils::underlying_type(HalL1MemAddrType::FW_VERSION_ADDR)] = eth_l1_mem::address_map::FW_VERSION_ADDR; std::vector mem_map_sizes; mem_map_sizes.resize(utils::underlying_type(HalL1MemAddrType::COUNT)); - mem_map_sizes[utils::underlying_type(HalL1MemAddrType::BARRIER)] = sizeof(uint32_t); + mem_map_sizes[utils::underlying_type(HalL1MemAddrType::BARRIER)] = eth_l1_mem::address_map::ERISC_BARRIER_SIZE; mem_map_sizes[utils::underlying_type(HalL1MemAddrType::MAILBOX)] = eth_l1_mem::address_map::ERISC_MEM_MAILBOX_SIZE; mem_map_sizes[utils::underlying_type(HalL1MemAddrType::LAUNCH)] = sizeof(launch_msg_t); mem_map_sizes[utils::underlying_type(HalL1MemAddrType::WATCHER)] = sizeof(watcher_msg_t); @@ -53,6 +56,7 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_sizes[utils::underlying_type(HalL1MemAddrType::UNRESERVED)] = eth_l1_mem::address_map::MAX_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; mem_map_sizes[utils::underlying_type(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); 
mem_map_sizes[utils::underlying_type(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(uint32_t); + mem_map_sizes[utils::underlying_type(HalL1MemAddrType::FW_VERSION_ADDR)] = sizeof(std::uint32_t); // TODO (abhullar): This should be NumEthDispatchClasses std::vector> processor_classes(1); diff --git a/tt_metal/llrt/hal.hpp b/tt_metal/llrt/hal.hpp index 875252ca8da..1ba7a104e84 100644 --- a/tt_metal/llrt/hal.hpp +++ b/tt_metal/llrt/hal.hpp @@ -50,6 +50,7 @@ enum class HalL1MemAddrType : uint8_t { CORE_INFO, GO_MSG, LAUNCH_MSG_BUFFER_RD_PTR, + FW_VERSION_ADDR, // Really only applicable to active eth core right now COUNT // Keep this last so it always indicates number of enum options }; diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index 265e928514b..715ab3c974e 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -271,7 +271,6 @@ void Cluster::open_driver(const bool &skip_driver_allocs) { device_driver->configure_active_ethernet_cores_for_mmio_device(mmio_device_id, {}); } } - device_driver->set_driver_eth_interface_params(eth_interface_params); // Adding this check is a workaround for current UMD bug that only uses this getter to populate private metadata // that is later expected to be populated by unrelated APIs @@ -281,6 +280,12 @@ void Cluster::open_driver(const bool &skip_driver_allocs) { } std::uint32_t dram_barrier_base = tt_metal::hal.get_dev_addr(tt_metal::HalDramMemAddrType::DRAM_BARRIER); device_driver->set_device_dram_address_params(tt_device_dram_address_params{dram_barrier_base}); + + l1_address_params.tensix_l1_barrier_base = tt_metal::hal.get_dev_addr(tt_metal::HalProgrammableCoreType::TENSIX, tt_metal::HalL1MemAddrType::BARRIER); + if (tt_metal::hal.get_arch() != tt::ARCH::GRAYSKULL) { + l1_address_params.eth_l1_barrier_base = tt_metal::hal.get_dev_addr(tt_metal::HalProgrammableCoreType::ACTIVE_ETH, tt_metal::HalL1MemAddrType::BARRIER); + l1_address_params.fw_version_addr = tt_metal::hal.get_dev_addr(tt_metal::HalProgrammableCoreType::ACTIVE_ETH, tt_metal::HalL1MemAddrType::FW_VERSION_ADDR); + } device_driver->set_device_l1_address_params(l1_address_params); this->get_metal_desc_from_tt_desc( @@ -344,18 +349,6 @@ uint32_t Cluster::get_harvested_rows(chip_id_t chip) const { } } -void Cluster::verify_eth_fw() const { - for (const auto &[chip, mmio_device_id] : this->device_to_mmio_device_) { - std::vector fw_versions; - for (const CoreCoord ð_core : get_soc_desc(chip).ethernet_cores) { - uint32_t val; - read_core(&val, sizeof(uint32_t), tt_cxy_pair(chip, eth_core), eth_l1_mem::address_map::FW_VERSION_ADDR); - fw_versions.push_back(val); - } - verify_sw_fw_versions(chip, SW_VERSION, fw_versions); - } -} - int Cluster::get_device_aiclk(const chip_id_t &chip_id) const { if (this->arch_ == tt::ARCH::BLACKHOLE) { // For Blackhole bring up remove AICLK query due to lack of ARC message support diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp index 26fc0409d71..f59bf5d42e9 100644 --- a/tt_metal/llrt/tt_cluster.hpp +++ b/tt_metal/llrt/tt_cluster.hpp @@ -17,7 +17,6 @@ // clang-format off #include "noc/noc_parameters.h" -#include "eth_interface.h" #include "eth_l1_address_map.h" #include "dev_msgs.h" // clang-format on @@ -72,7 +71,6 @@ class Cluster { } //! 
device driver and misc apis - void verify_eth_fw() const; void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector &fw_versions) const; void deassert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const; @@ -285,39 +283,7 @@ class Cluster { // Mapping of each devices' ethernet routing mode std::unordered_map> device_eth_routing_info_; - tt_device_l1_address_params l1_address_params = { - (uint32_t)MEM_NCRISC_FIRMWARE_BASE, - (uint32_t)MEM_BRISC_FIRMWARE_BASE, - (uint32_t)MEM_TRISC0_FIRMWARE_SIZE, - (uint32_t)MEM_TRISC1_FIRMWARE_SIZE, - (uint32_t)MEM_TRISC2_FIRMWARE_SIZE, - (uint32_t)MEM_TRISC0_FIRMWARE_BASE, - (uint32_t)MEM_L1_BARRIER, - (uint32_t)eth_l1_mem::address_map::ERISC_BARRIER_BASE, - (uint32_t)eth_l1_mem::address_map::FW_VERSION_ADDR, - }; - - tt_driver_eth_interface_params eth_interface_params = { - NOC_ADDR_LOCAL_BITS, - NOC_ADDR_NODE_ID_BITS, - ETH_RACK_COORD_WIDTH, - CMD_BUF_SIZE_MASK, - MAX_BLOCK_SIZE, - REQUEST_CMD_QUEUE_BASE, - RESPONSE_CMD_QUEUE_BASE, - CMD_COUNTERS_SIZE_BYTES, - REMOTE_UPDATE_PTR_SIZE_BYTES, - CMD_DATA_BLOCK, - CMD_WR_REQ, - CMD_WR_ACK, - CMD_RD_REQ, - CMD_RD_DATA, - CMD_BUF_SIZE, - CMD_DATA_BLOCK_DRAM, - ETH_ROUTING_DATA_BUFFER_ADDR, - REQUEST_ROUTING_CMD_QUEUE_BASE, - RESPONSE_ROUTING_CMD_QUEUE_BASE, - CMD_BUF_PTR_MASK}; + tt_device_l1_address_params l1_address_params; std::unordered_map>> ethernet_sockets_; }; diff --git a/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp b/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp index 423098ba44f..ecb4a74a1ef 100644 --- a/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp +++ b/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp @@ -6,6 +6,8 @@ #define COMPILE_FOR_ERISC +#include + #include "llrt/hal.hpp" #include "llrt/wormhole/wh_hal.hpp" #include "hw/inc/wormhole/core_config.h" @@ -29,7 +31,7 @@ HalCoreInfoType create_active_eth_mem_map() { std::vector mem_map_bases; mem_map_bases.resize(utils::underlying_type(HalL1MemAddrType::COUNT)); - mem_map_bases[utils::underlying_type(HalL1MemAddrType::BARRIER)] = MEM_L1_BARRIER; + mem_map_bases[utils::underlying_type(HalL1MemAddrType::BARRIER)] = eth_l1_mem::address_map::ERISC_BARRIER_BASE; mem_map_bases[utils::underlying_type(HalL1MemAddrType::MAILBOX)] = eth_l1_mem::address_map::ERISC_MEM_MAILBOX_BASE; mem_map_bases[utils::underlying_type(HalL1MemAddrType::LAUNCH)] = GET_ETH_MAILBOX_ADDRESS_HOST(launch); mem_map_bases[utils::underlying_type(HalL1MemAddrType::WATCHER)] = GET_ETH_MAILBOX_ADDRESS_HOST(watcher); @@ -40,10 +42,11 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_bases[utils::underlying_type(HalL1MemAddrType::CORE_INFO)] = GET_ETH_MAILBOX_ADDRESS_HOST(core_info); mem_map_bases[utils::underlying_type(HalL1MemAddrType::GO_MSG)] = GET_ETH_MAILBOX_ADDRESS_HOST(go_message); mem_map_bases[utils::underlying_type(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = GET_ETH_MAILBOX_ADDRESS_HOST(launch_msg_rd_ptr); + mem_map_bases[utils::underlying_type(HalL1MemAddrType::FW_VERSION_ADDR)] = eth_l1_mem::address_map::FW_VERSION_ADDR; std::vector mem_map_sizes; mem_map_sizes.resize(utils::underlying_type(HalL1MemAddrType::COUNT)); - mem_map_sizes[utils::underlying_type(HalL1MemAddrType::BARRIER)] = sizeof(uint32_t); + mem_map_sizes[utils::underlying_type(HalL1MemAddrType::BARRIER)] = eth_l1_mem::address_map::ERISC_BARRIER_SIZE; mem_map_sizes[utils::underlying_type(HalL1MemAddrType::MAILBOX)] = eth_l1_mem::address_map::ERISC_MEM_MAILBOX_SIZE; mem_map_sizes[utils::underlying_type(HalL1MemAddrType::LAUNCH)] = sizeof(launch_msg_t); 
mem_map_sizes[utils::underlying_type(HalL1MemAddrType::WATCHER)] = sizeof(watcher_msg_t); @@ -53,6 +56,7 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_sizes[utils::underlying_type(HalL1MemAddrType::UNRESERVED)] = eth_l1_mem::address_map::MAX_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; mem_map_sizes[utils::underlying_type(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[utils::underlying_type(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(uint32_t); + mem_map_sizes[utils::underlying_type(HalL1MemAddrType::FW_VERSION_ADDR)] = sizeof(std::uint32_t); std::vector> processor_classes(NumEthDispatchClasses); std::vector processor_types{0}; diff --git a/tt_metal/third_party/umd b/tt_metal/third_party/umd index 718132f4503..baed1c889ff 160000 --- a/tt_metal/third_party/umd +++ b/tt_metal/third_party/umd @@ -1 +1 @@ -Subproject commit 718132f450313cc08073f32494adcffa94fc1725 +Subproject commit baed1c889ff63e4d84b23d6d8cb3ad24a2390384 diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index fd207c920a0..1124375431c 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -199,6 +199,14 @@ set(ALL_TTNN_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_device_operation.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b_pybind.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_program_factory.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_program_factory.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_device_operation.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/create_qkv_heads.cpp diff --git a/ttnn/cpp/ttnn/operation.hpp b/ttnn/cpp/ttnn/operation.hpp index e20faf18fdf..267571da734 100644 --- a/ttnn/cpp/ttnn/operation.hpp +++ b/ttnn/cpp/ttnn/operation.hpp @@ -287,6 +287,22 @@ constexpr bool implements_validate_with_output_tensors_and_optional_input_tensor const OptionalTensors&>; // optional output_tensors } +template +using has_compute_output_shapes_t = 
decltype(std::declval().compute_output_shapes(std::declval()...)); + +template +constexpr bool implements_compute_output_shapes() { + return std::experimental::is_detected_v; +} + +template +using has_compute_output_specs_t = decltype(std::declval().compute_output_specs(std::declval()...)); + +template +constexpr bool implements_compute_output_specs() { + return std::experimental::is_detected_v; +} + template using has_create_output_tensors_t = decltype(std::declval().create_output_tensors(std::declval()...)); @@ -378,13 +394,41 @@ constexpr bool implements_get_parallelization_strategy() { return std::experimental::is_detected_v; } +template +auto default_create_output_tensors( + const ConcreteOperation& operation, + const Tensors& input_tensors, + const OptionalTensors& optional_output_tensors) -> ProgramOutputTensors { + using OutputTensors = ProgramOutputTensors; + OutputTensors output_tensors; + + if (!optional_output_tensors.empty() and optional_output_tensors[0].has_value()) { + output_tensors.reserve(optional_output_tensors.size()); + for (const auto& optional_output_tensor : optional_output_tensors) { + TT_FATAL(optional_output_tensor.has_value(), "If using optional output tensors, all output tensors must have a value"); + output_tensors.emplace_back(optional_output_tensor.value()); + } + return output_tensors; + } + const auto& device = input_tensors.at(0).device(); + const auto& output_specs = operation.compute_output_specs(input_tensors); + output_tensors.reserve(output_specs.size()); + for (const auto& [output_shape, output_layout] : output_specs) { + output_tensors.emplace_back(create_device_tensor( + output_shape, + output_layout, + device)); + } + return output_tensors; +} + } // namespace detail template struct DeviceOperation final { using storage_t = std::array; using OutputTensors = OutputTensorsT; - using ComputedShapes = std::variant, std::vector>; + using ComputedShapes = std::variant, std::vector, std::vector>; inline const std::string get_type_name() const { return this->get_type_name_impl_(this->type_erased_storage); } @@ -396,6 +440,7 @@ struct DeviceOperation final { this->type_erased_storage, input_tensors, optional_input_tensors, optional_output_tensors); } + // TODO: Rename into compute_output_specs in later PR inline const ComputedShapes compute_output_shapes(const Tensors& input_tensors) const { return this->compute_output_shapes_impl_(this->type_erased_storage, input_tensors); } @@ -502,14 +547,6 @@ struct DeviceOperation final { static_assert( tt::stl::concepts::always_false_v, "You cannot implement both validate and validate_with_output_tensors"); - } else if constexpr ( - (detail::implements_validate_with_output_tensors() or - detail::implements_validate_with_output_tensors_and_optional_input_tensors()) and - not detail::implements_create_output_tensors_with_optional_output_tensors()) { - static_assert( - tt::stl::concepts::always_false_v, - "Operation doesn't implement create_output_tensors with ant optional output tensors argument " - "when using validate_with_output_tensors"); } else if constexpr (detail::implements_validate() and not detail::implements_create_program()) { static_assert( tt::stl::concepts::always_false_v, @@ -547,7 +584,19 @@ struct DeviceOperation final { compute_output_shapes_impl_{ [](const storage_t& storage, const Tensors& input_tensors) -> const ComputedShapes { const auto& operation = *reinterpret_cast*>(&storage); - return operation.compute_output_shapes(input_tensors); + if constexpr 
(detail::implements_compute_output_shapes() and detail::implements_compute_output_specs()) { + static_assert( + tt::stl::concepts::always_false_v, + "Operation cannot implement both compute_output_shapes and compute_output_specs"); + } else if constexpr (detail::implements_compute_output_shapes()) { + return operation.compute_output_shapes(input_tensors); + } else if constexpr (detail::implements_compute_output_specs()) { + return operation.compute_output_specs(input_tensors); + } else { + static_assert( + tt::stl::concepts::always_false_v, + "Operation must implement either compute_output_shapes or compute_output_specs"); + } }}, create_output_tensors_impl_{ [](const storage_t& storage, @@ -555,9 +604,21 @@ struct DeviceOperation final { const OptionalTensors& output_tensors) -> const OutputTensors { const auto& operation = *reinterpret_cast*>(&storage); if constexpr (detail::implements_create_output_tensors_with_optional_output_tensors()) { + static_assert( + detail::implements_compute_output_shapes(), + "Operation must implement compute_output_shapes if it implements create_output_tensors"); return operation.create_output_tensors(input_tensors, output_tensors); - } else { + } else if constexpr (detail::implements_create_output_tensors()) { + static_assert( + detail::implements_compute_output_shapes(), + "Operation must implement compute_output_shapes if it implements create_output_tensors"); return operation.create_output_tensors(input_tensors); + } else if constexpr (detail::implements_compute_output_specs()) { + return detail::default_create_output_tensors(operation, input_tensors, output_tensors); + } else { + static_assert( + tt::stl::concepts::always_false_v, + "Operation must implement either create_output_tensors or compute_output_specs"); } }}, create_program_impl_{ diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.cpp index 386a76fef93..8ded07491f0 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.cpp @@ -82,11 +82,11 @@ void SliceDeviceOperation::validate_with_output_tensors( TT_FATAL(this->slice_start[i] <= this->slice_end[i], "Error"); } if(!output_tensors.empty() && output_tensors[0].has_value()){ - const auto output_shape_required = this->compute_output_shapes(input_tensors)[0]; + const auto output_shape_required = std::get<0>(this->compute_output_specs(input_tensors)[0]); const auto& out_tensor = output_tensors[0].value(); - TT_FATAL(out_tensor.get_legacy_shape() == output_shape_required, "The input tensors need a shape of {}, however the output tensor is only {}", output_shape_required, out_tensor.get_legacy_shape()); + TT_FATAL(out_tensor.get_padded_shape() == output_shape_required, "The input tensors need a shape of {}, however the output tensor is only {}", output_shape_required, out_tensor.get_padded_shape()); } - auto output_tensor_shape = this->compute_output_shapes(input_tensors)[0]; + auto output_tensor_shape = std::get<0>(this->compute_output_specs(input_tensors)[0]); if (has_step) { // if all ones modify before passing in to function TT_FATAL(input_tensor_a.get_layout() == Layout::ROW_MAJOR, "Strided slice is only supported for row major layout"); TT_FATAL(!input_tensor_a.is_sharded(), "Strided slice is not supported for sharded tensor"); @@ -117,40 +117,18 @@ void SliceDeviceOperation::validate_with_output_tensors( } } -std::vector 
SliceDeviceOperation::compute_output_shapes(const std::vector &input_tensors) const { - SmallVector out_shape; - auto rank = input_tensors[0].get_legacy_shape().rank(); - out_shape.reserve(rank); +std::vector SliceDeviceOperation::compute_output_specs(const std::vector &input_tensors) const { + const auto& input_tensor = input_tensors[0]; + SmallVector out_shape(input_tensor.get_logical_shape().rank()); auto output_dim_i = [this] (size_t i) { return (this->slice_end[i] - this->slice_start[i] + this->step[i] - 1) / this->step[i]; }; - for (uint32_t i = 0; i < rank; i++) { - out_shape.push_back(output_dim_i(i)); - } - tt::tt_metal::LegacyShape output_tensor_shape(out_shape); - return {output_tensor_shape}; -} - -std::vector SliceDeviceOperation::create_output_tensors( - const std::vector &input_tensors, const std::vector> &output_tensors) const { - if (!output_tensors.empty() && output_tensors[0].has_value()) { - return {output_tensors[0].value()}; - } - const auto &input_tensor_a = input_tensors.at(0); - const auto shapes = compute_output_shapes(input_tensors); - - if (input_tensor_a.is_sharded()) { - return {create_device_tensor( - shapes[0], - input_tensor_a.get_dtype(), - input_tensor_a.get_layout(), - input_tensor_a.device(), - this->output_mem_config)}; - } else { - return operation::generic_create_output_tensors( - *this, input_tensors, input_tensor_a.get_dtype(), input_tensor_a.get_layout(), this->output_mem_config); + for (uint32_t i = 0; i < out_shape.size(); i++) { + out_shape[i] = output_dim_i(i); } + ttnn::SimpleShape output_tensor_shape(std::move(out_shape)); + return {ttnn::TensorSpec(output_tensor_shape, tt::tt_metal::TensorLayout(input_tensor.get_dtype(), PageConfig(input_tensor.get_layout()), this->output_mem_config))}; } operation::ProgramWithCallbacks SliceDeviceOperation::create_program( diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.hpp b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.hpp index 5fdd6922cc9..a663db54a45 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.hpp @@ -22,8 +22,7 @@ struct SliceDeviceOperation { void validate_with_output_tensors(const std::vector& input_tensors, const std::vector>& output_tensors) const; - std::vector compute_output_shapes(const std::vector& input_tensors) const; - std::vector create_output_tensors(const std::vector& input_tensors, const std::vector>& output_tensors) const; + std::vector compute_output_specs(const std::vector& input_tensors) const; operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.cpp index 2c420128583..6dad529d179 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.cpp @@ -77,72 +77,57 @@ void Transpose::validate(const std::vector &input_tensors) const { } -std::vector Transpose::compute_output_shapes(const std::vector &input_tensors) const { +std::vector Transpose::compute_output_specs(const std::vector &input_tensors) const { const auto& input_tensor = input_tensors.at(0); - auto out_shape = input_tensor.get_legacy_shape(); - auto padding = out_shape.padding(); + + // TODO: Remove usage of input/output padded shape + // - Get output alignment from 
input alignment and output dtype, layout, mem_config + // - Get shard spec from output strides (logical shape + alignment)? + auto output_shape = input_tensor.get_logical_shape(); + auto output_padded_shape = input_tensor.get_padded_shape(); + switch (this->dim){ case TransposeOpDim::CN: - std::swap(out_shape[0], out_shape[1]); - std::swap(padding[0], padding[1]); + std::swap(output_shape[0], output_shape[1]); + std::swap(output_padded_shape[0], output_padded_shape[1]); break; case TransposeOpDim::HC: - std::swap(out_shape[1], out_shape[2]); - std::swap(padding[1], padding[2]); + std::swap(output_shape[1], output_shape[2]); + std::swap(output_padded_shape[1], output_padded_shape[2]); break; case TransposeOpDim::WH: - std::swap(out_shape[2], out_shape[3]); - std::swap(padding[2], padding[3]); + std::swap(output_shape[2], output_shape[3]); + std::swap(output_padded_shape[2], output_padded_shape[3]); break; case TransposeOpDim::NH: - std::swap(out_shape[0], out_shape[2]); - std::swap(padding[0], padding[2]); + std::swap(output_shape[0], output_shape[2]); + std::swap(output_padded_shape[0], output_padded_shape[2]); break; case TransposeOpDim::NW: - std::swap(out_shape[0], out_shape[3]); - std::swap(padding[0], padding[3]); + std::swap(output_shape[0], output_shape[3]); + std::swap(output_padded_shape[0], output_padded_shape[3]); break; case TransposeOpDim::CW: - std::swap(out_shape[1], out_shape[3]); - std::swap(padding[1], padding[3]); + std::swap(output_shape[1], output_shape[3]); + std::swap(output_padded_shape[1], output_padded_shape[3]); break; } - return {tt::tt_metal::LegacyShape(out_shape, padding)}; -} - -std::vector Transpose::create_output_tensors(const std::vector &input_tensors) const { - const auto& input_tensor = input_tensors.at(0); - // This is only for WH + auto output_mem_config = this->output_mem_config; if (this->output_mem_config.is_sharded()) { if (this->dim == TransposeOpDim::WH) { + const auto& input_padded_shape = input_tensor.get_padded_shape(); ShardSpec shard_spec = input_tensor.shard_spec().value(); - shard_spec.shape[0] = shard_spec.shape[0] / input_tensor.get_legacy_shape()[-2] * input_tensor.get_legacy_shape()[-1]; - shard_spec.shape[1] = input_tensor.get_legacy_shape()[-2]; - const auto output_shape = this->compute_output_shapes(input_tensors)[0]; - auto mem_config = this->output_mem_config; - mem_config.shard_spec = shard_spec; - return {create_device_tensor( - output_shape, - input_tensor.get_dtype(), - input_tensor.get_layout(), - input_tensor.device(), - mem_config)}; + shard_spec.shape[0] = shard_spec.shape[0] / input_padded_shape[-2] * input_padded_shape[-1]; + shard_spec.shape[1] = input_padded_shape[-2]; + output_mem_config.shard_spec = shard_spec; } else if (this->dim == TransposeOpDim::HC) { - const auto output_shape = this->compute_output_shapes(input_tensors)[0]; - auto mem_config = this->output_mem_config; - mem_config.shard_spec = input_tensor.shard_spec().value(); - return {create_device_tensor( - output_shape, - input_tensor.get_dtype(), - input_tensor.get_layout(), - input_tensor.device(), - mem_config)}; + output_mem_config.shard_spec = input_tensor.shard_spec().value(); } else { TT_ASSERT(false, "Unsupported sharding"); } } - return operation::generic_create_output_tensors(*this, input_tensors, input_tensor.get_dtype(), input_tensor.get_layout(), this->output_mem_config); + return {ttnn::TensorSpec(output_shape, TensorLayout::fromLegacyPaddedShape(input_tensor.get_dtype(), PageConfig(input_tensor.get_layout()), output_mem_config, 
ttnn::Shape(output_shape.view(), output_padded_shape.view())))}; } operation::ProgramWithCallbacks Transpose::create_program(const std::vector& input_tensors, std::vector &output_tensors) const { diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.hpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.hpp index 776509a6c80..44979d114e0 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.hpp @@ -23,8 +23,7 @@ struct Transpose { const MemoryConfig output_mem_config; void validate(const std::vector &input_tensors) const; - std::vector compute_output_shapes(const std::vector &input_tensors) const; - std::vector create_output_tensors(const std::vector &input_tensors) const; + std::vector compute_output_specs(const std::vector &input_tensors) const; operation::ProgramWithCallbacks create_program(const std::vector& input_tensors, std::vector &output_tensors) const; TransposeOpParallelizationStrategy get_parallelization_strategy(const std::vector &input_tensors) const; diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp index 9626a678f6a..5033425fd38 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp @@ -81,7 +81,7 @@ void bind_primitive_binary_operation(py::module& module, const binary_operation_ template -void bind_binary_operation(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& info=". ", const std::string& note=" ") { +void bind_binary_operation(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& info=". ", const std::string& supported_dtype="BFLOAT16", const std::string& note=" ") { auto doc = fmt::format( R"doc( {2} @@ -106,7 +106,19 @@ void bind_binary_operation(py::module& module, const binary_operation_t& operati Supports broadcasting. Note: - {5} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {5} + - TILE + - 2, 3, 4 + + {6} Example: @@ -119,6 +131,7 @@ void bind_binary_operation(py::module& module, const binary_operation_t& operati description, math, info, + supported_dtype, note); bind_registered_operation( @@ -173,7 +186,7 @@ void bind_binary_operation(py::module& module, const binary_operation_t& operati } template -void bind_binary_composite(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& note="") { +void bind_binary_composite(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& supported_dtype = "BFLAOT16", const std::string& note="") { auto doc = fmt::format( R"doc( {2} @@ -192,7 +205,19 @@ void bind_binary_composite(py::module& module, const binary_operation_t& operati ttnn.Tensor: the output tensor. Note: - {4} + Supported dtypes, layouts, and ranks: + + .. 
list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {4} + - TILE + - 2, 3, 4 + + {5} Example: >>> tensor1 = ttnn.to_device(ttnn.from_torch(torch.tensor(([[1, 2], [3, 4]]), dtype=torch.bfloat16)), device=device) @@ -205,6 +230,7 @@ void bind_binary_composite(py::module& module, const binary_operation_t& operati operation.python_fully_qualified_name(), description, math, + supported_dtype, note); bind_registered_operation( @@ -471,7 +497,7 @@ void bind_div(py::module& module, const binary_operation_t& operation, const std } template -void bind_polyval(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& note=" ") { +void bind_polyval(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& supported_dtype="BFLOAT16", const std::string& note=" ") { auto doc = fmt::format( R"doc( {2} @@ -490,7 +516,19 @@ void bind_polyval(py::module& module, const binary_operation_t& operation, const ttnn.Tensor: the output tensor. Note: - {4} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {4} + - TILE + - 2, 3, 4 + + {5} Example: >>> tensor = ttnn.to_device(ttnn.from_torch(torch.tensor(([[1, 2], [3, 4]]), dtype=torch.bfloat16)), device=device) @@ -503,6 +541,7 @@ void bind_polyval(py::module& module, const binary_operation_t& operation, const operation.python_fully_qualified_name(), description, math, + supported_dtype, note); bind_registered_operation( @@ -577,7 +616,7 @@ void bind_binary_overload_operation(py::module& module, const binary_operation_t } template -void bind_inplace_operation(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math) { +void bind_inplace_operation(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& supported_dtype="BFLOAT16", const std::string& note="") { auto doc = fmt::format( R"doc( {2} @@ -592,6 +631,21 @@ void bind_inplace_operation(py::module& module, const binary_operation_t& operat Returns: ttnn.Tensor: the output tensor. + Note: + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {4} + - TILE + - 2, 3, 4 + + {5} + Example: >>> tensor = ttnn.from_torch(torch.tensor(([[1, 2], [3, 4]]), dtype=torch.bfloat16), device=device) >>> output = {1}(tensor1, tensor2) @@ -599,7 +653,9 @@ void bind_inplace_operation(py::module& module, const binary_operation_t& operat operation.base_name(), operation.python_fully_qualified_name(), description, - math); + math, + supported_dtype, + note); bind_registered_operation( module, @@ -626,7 +682,7 @@ void bind_inplace_operation(py::module& module, const binary_operation_t& operat } template -void bind_logical_inplace_operation(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& note=" ") { +void bind_logical_inplace_operation(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math, const std::string& supported_dtype = "BFLOAT16", const std::string& note=" ") { auto doc = fmt::format( R"doc( {2} @@ -642,7 +698,19 @@ void bind_logical_inplace_operation(py::module& module, const binary_operation_t ttnn.Tensor: the output tensor. 
Note: - {4} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {4} + - TILE + - 2, 3, 4 + + {5} Example: >>> tensor1 = ttnn.from_torch(torch.tensor(([[1, 2], [3, 4]]), dtype=torch.bfloat16), device=device) @@ -653,6 +721,7 @@ void bind_logical_inplace_operation(py::module& module, const binary_operation_t operation.python_fully_qualified_name(), description, math, + supported_dtype, note); bind_registered_operation( @@ -731,7 +800,7 @@ void py_module(py::module& module) { ttnn::add, R"doc(Adds :attr:`input_tensor_a` to :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", R"doc(\mathrm{{output\_tensor}}_i = (\mathrm{{input\_tensor\_a}}_i + \mathrm{{input\_tensor\_b}}_i))doc", - R"doc(: :code:`'None'` | :code:`'relu'`. )doc"); + R"doc(: :code:`'None'` | :code:`'relu'`. )doc", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_inplace_operation( module, @@ -745,14 +814,7 @@ void py_module(py::module& module) { R"doc(Subtracts :attr:`input_tensor_b` from :attr:`input_tensor_a` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", R"doc(\mathrm{{output\_tensor}}_i = (\mathrm{{input\_tensor\_a}}_i - \mathrm{{input\_tensor\_b}}_i))doc", R"doc(: :code:`'None'` | :code:`'relu'`. )doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_inplace_operation( module, @@ -765,15 +827,7 @@ void py_module(py::module& module) { ttnn::multiply, R"doc(Multiplies :attr:`input_tensor_a` by :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", R"doc(\mathrm{{output\_tensor}}_i = (\mathrm{{input\_tensor\_a}}_i * \mathrm{{input\_tensor\_b}}_i))doc", - R"doc(: :code:`'None'` | :code:`'relu'`. )doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc(: :code:`'None'` | :code:`'relu'`. )doc", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_inplace_operation( module, @@ -821,91 +875,37 @@ void py_module(py::module& module) { module, ttnn::logical_and, R"doc(Compute logical AND of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", - R"doc(\mathrm{{output\_tensor}}_i = (\mathrm{{input\_tensor\_a}}_i \& \mathrm{{input\_tensor\_b}}_i))doc",". 
", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc(\mathrm{{output\_tensor}}_i = (\mathrm{{input\_tensor\_a}}_i \& \mathrm{{input\_tensor\_b}}_i))doc",". ", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_operation( module, ttnn::logical_or, R"doc(Compute logical OR of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", - R"doc(\mathrm{{output\_tensor}}_i = (\mathrm{{input\_tensor\_a}}_i | \mathrm{{input\_tensor\_b}}_i))doc",". ", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc(\mathrm{{output\_tensor}}_i = (\mathrm{{input\_tensor\_a}}_i | \mathrm{{input\_tensor\_b}}_i))doc",". ", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_operation( module, ttnn::ldexp, R"doc(Compute ldexp of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", - R"doc(\mathrm{{output\_tensor}} = ldexp(\mathrm{{input\_tensor\_a,input\_tensor\_b}}))doc",". ", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc(\mathrm{{output\_tensor}} = ldexp(\mathrm{{input\_tensor\_a,input\_tensor\_b}}))doc",". ", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_operation( module, ttnn::logaddexp, R"doc(Compute logaddexp of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", - R"doc(\mathrm{{output\_tensor}} = logaddexp(\mathrm{{input\_tensor\_a,input\_tensor\_b}}))doc",". ", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc(\mathrm{{output\_tensor}} = logaddexp(\mathrm{{input\_tensor\_a,input\_tensor\_b}}))doc",". ", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_operation( module, ttnn::logaddexp2, R"doc(Compute logaddexp2 of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", - R"doc(\mathrm{{output\_tensor}} = logaddexp2(\mathrm{{input\_tensor\_a,input\_tensor\_b}}))doc",". 
", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc(\mathrm{{output\_tensor}} = logaddexp2(\mathrm{{input\_tensor\_a,input\_tensor\_b}}))doc",". ", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_operation( module, ttnn::squared_difference, R"doc(Compute squared difference of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", - R"doc(\mathrm{{output\_tensor}} = \verb|squared_difference|(\mathrm{{input\_tensor\_a,input\_tensor\_b}}))doc", ". ", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc(\mathrm{{output\_tensor}} = \verb|squared_difference|(\mathrm{{input\_tensor\_a,input\_tensor\_b}}))doc", ". ", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_operation( module, @@ -939,14 +939,6 @@ void py_module(py::module& module) { ttnn::xlogy, R"doc(Compute xlogy :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", R"doc(\mathrm{output\_tensor}_i = \mathrm{input\_tensor\_a}_i \cdot \log(\mathrm{input\_tensor\_b}_i) - )doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16 | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ )doc"); detail::bind_binary_composite( @@ -973,59 +965,25 @@ void py_module(py::module& module) { ttnn::logical_xor, R"doc(Compute logical_xor :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", R"doc(\mathrm{output\_tensor}_i = (\mathrm{input\_tensor\_a}_i \land \lnot \mathrm{input\_tensor\_b}_i) \lor (\lnot \mathrm{input\_tensor\_a}_i \land \mathrm{input\_tensor\_b}_i))doc",".", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_logical_inplace_operation( module, ttnn::logical_or_, R"doc(Compute inplace logical OR of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", - R"doc((\mathrm{{input\_tensor\_a}}_i | \mathrm{{input\_tensor\_b}}_i))doc", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - 
+----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc((\mathrm{{input\_tensor\_a}}_i | \mathrm{{input\_tensor\_b}}_i))doc",R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_logical_inplace_operation( module, ttnn::logical_xor_, R"doc(Compute inplace logical XOR of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", - R"doc((\mathrm{input\_tensor\_a}_i \land \lnot \mathrm{input\_tensor\_b}_i) \lor (\lnot \mathrm{input\_tensor\_a}_i \land \mathrm{input\_tensor\_b}_i))doc", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc((\mathrm{input\_tensor\_a}_i \land \lnot \mathrm{input\_tensor\_b}_i) \lor (\lnot \mathrm{input\_tensor\_a}_i \land \mathrm{input\_tensor\_b}_i))doc", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_logical_inplace_operation( module, ttnn::logical_and_, R"doc(Compute inplace logical AND of :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", - R"doc((\mathrm{{input\_tensor\_a}}_i \& \mathrm{{input\_tensor\_b}}_i))doc", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + R"doc((\mathrm{{input\_tensor\_a}}_i \& \mathrm{{input\_tensor\_b}}_i))doc", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_composite( module, @@ -1105,16 +1063,7 @@ void py_module(py::module& module) { ttnn::polyval, R"doc(Compute polyval of all elements of :attr:`input_tensor_a` with coefficient :attr:`coeffs` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", R"doc(\mathrm{output\_tensor} = \sum_{i=0}^{n} (\mathrm{coeffs}_i) (\mathrm{input\_tensor}^i) - )doc", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - )doc"); + )doc",R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_overload_operation( module, diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward_pybind.hpp index 22e09bdaac3..b99897762e7 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_backward/binary_backward_pybind.hpp @@ -22,7 +22,7 @@ namespace binary_backward { namespace detail { template -void bind_binary_backward_ops(py::module& module, const binary_backward_operation_t& operation, const 
std::string_view description, const std::string_view supported_dtype = "") { +void bind_binary_backward_ops(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( {2} @@ -39,10 +39,19 @@ void bind_binary_backward_ops(py::module& module, const binary_backward_operatio List of ttnn.Tensor: the output tensor. Note: - {3} + Supported dtypes, layouts, and ranks: - Note : bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT + .. list-table:: + :header-rows: 1 + * - Dtypes + - Layouts + - Ranks + * - {3} + - TILE + - 2, 3, 4 + + bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT Example: >>> grad_tensor = ttnn.to_device(ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16)), device=device) @@ -79,7 +88,7 @@ void bind_binary_backward_ops(py::module& module, const binary_backward_operatio } template -void bind_binary_backward_int_default(py::module& module, const binary_backward_operation_t& operation, const std::string& parameter_name, const std::string& parameter_doc, int parameter_value, const std::string_view description, const std::string_view supported_dtype = "") { +void bind_binary_backward_int_default(py::module& module, const binary_backward_operation_t& operation, const std::string& parameter_name, const std::string& parameter_doc, int parameter_value, const std::string_view description, const std::string_view supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( {5} @@ -104,9 +113,19 @@ void bind_binary_backward_int_default(py::module& module, const binary_backward_ Note: - {6} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 - Note : bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT + * - Dtypes + - Layouts + - Ranks + * - {6} + - TILE + - 2, 3, 4 + + bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT Example: @@ -157,7 +176,7 @@ void bind_binary_backward_int_default(py::module& module, const binary_backward_ } template -void bind_binary_backward_opt_float_default(py::module& module, const binary_backward_operation_t& operation, const std::string& parameter_name, const std::string& parameter_doc, float parameter_value, const std::string_view description, const std::string_view supported_dtype = "") { +void bind_binary_backward_opt_float_default(py::module& module, const binary_backward_operation_t& operation, const std::string& parameter_name, const std::string& parameter_doc, float parameter_value, const std::string_view description, const std::string_view supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( {5} @@ -183,9 +202,19 @@ void bind_binary_backward_opt_float_default(py::module& module, const binary_bac Note: - {6} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 - Note : bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT + * - Dtypes + - Layouts + - Ranks + * - {6} + - TILE + - 2, 3, 4 + + bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT Example: @@ -246,7 +275,7 @@ void bind_binary_backward_float_string_default( const std::string& parameter_b_doc, string parameter_b_value, const std::string& description, - const std::string& supported_dtype) { + const std::string& supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( @@ -265,9 +294,19 @@ void bind_binary_backward_float_string_default( List of ttnn.Tensor: the output tensor. Note: - {8} + Supported dtypes, layouts, and ranks: + + .. 
list-table:: + :header-rows: 1 - Note : bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT + * - Dtypes + - Layouts + - Ranks + * - {8} + - TILE + - 2, 3, 4 + + bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT Example: @@ -323,7 +362,7 @@ void bind_binary_backward_float_string_default( } template -void bind_binary_backward_sub_alpha(py::module& module, const binary_backward_operation_t& operation, const std::string& parameter_name, const std::string& parameter_doc, float parameter_value, const std::string_view description, const std::string_view supported_dtype = "") { +void bind_binary_backward_sub_alpha(py::module& module, const binary_backward_operation_t& operation, const std::string& parameter_name, const std::string& parameter_doc, float parameter_value, const std::string_view description, const std::string_view supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( @@ -343,9 +382,19 @@ void bind_binary_backward_sub_alpha(py::module& module, const binary_backward_op queue_id (int, optional): command queue id. Defaults to `0`. Note: - {6} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 - Note : bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT + * - Dtypes + - Layouts + - Ranks + * - {6} + - TILE + - 2, 3, 4 + + bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT Example: @@ -394,7 +443,7 @@ void bind_binary_backward_sub_alpha(py::module& module, const binary_backward_op } template -void bind_binary_backward_rsub(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "") { +void bind_binary_backward_rsub(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( @@ -414,9 +463,19 @@ void bind_binary_backward_rsub(py::module& module, const binary_backward_operati queue_id (int, optional): command queue id. Defaults to `0`. Note: - {3} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 - Note : bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT + * - Dtypes + - Layouts + - Ranks + * - {3} + - TILE + - 2, 3, 4 + + bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT Example: @@ -460,7 +519,7 @@ void bind_binary_backward_rsub(py::module& module, const binary_backward_operati } template -void bind_binary_bw_mul(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "") { +void bind_binary_bw_mul(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( {2} @@ -481,9 +540,19 @@ void bind_binary_bw_mul(py::module& module, const binary_backward_operation_t& o List of ttnn.Tensor: the output tensor. Note: - {3} + Supported dtypes, layouts, and ranks: + + .. 
list-table:: + :header-rows: 1 - Note : bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT + * - Dtypes + - Layouts + - Ranks + * - {3} + - TILE + - 2, 3, 4 + + bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT Example: >>> grad_tensor = ttnn.to_device(ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16)), device) @@ -561,7 +630,7 @@ void bind_binary_bw_mul(py::module& module, const binary_backward_operation_t& o template -void bind_binary_bw(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "") { +void bind_binary_bw(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( @@ -581,9 +650,19 @@ void bind_binary_bw(py::module& module, const binary_backward_operation_t& opera queue_id (int, optional): command queue id. Defaults to `0`. Note: - {3} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 - Note : bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT + * - Dtypes + - Layouts + - Ranks + * - {3} + - TILE + - 2, 3, 4 + + bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT Example: >>> grad_tensor = ttnn.to_device(ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16)), device) @@ -662,7 +741,7 @@ void bind_binary_bw(py::module& module, const binary_backward_operation_t& opera } template -void bind_binary_bw_div(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "") { +void bind_binary_bw_div(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( @@ -687,9 +766,19 @@ void bind_binary_bw_div(py::module& module, const binary_backward_operation_t& o Supports broadcasting. Note: - {3} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 - Note : bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT + * - Dtypes + - Layouts + - Ranks + * - {3} + - TILE + - 2, 3, 4 + + bfloat8_b/bfloat4_b is only supported on TILE_LAYOUT Example: >>> grad_tensor = ttnn.to_device(ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16)), device) @@ -770,7 +859,7 @@ void bind_binary_bw_div(py::module& module, const binary_backward_operation_t& o } template -void bind_binary_backward_overload(py::module& module, const binary_backward_operation_t& operation, const std::string& description, const std::string& supported_dtype) { +void bind_binary_backward_overload(py::module& module, const binary_backward_operation_t& operation, const std::string& description, const std::string& supported_dtype = "BFLOAT16", const std::string& note = "") { auto doc = fmt::format( R"doc( @@ -788,7 +877,19 @@ void bind_binary_backward_overload(py::module& module, const binary_backward_ope List of ttnn.Tensor: the output tensor. Note: - {3} + Supported dtypes, layouts, and ranks: + + .. 
list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {3} + - TILE + - 2, 3, 4 + + {4} Example: @@ -800,7 +901,7 @@ void bind_binary_backward_overload(py::module& module, const binary_backward_ope operation.base_name(), operation.python_fully_qualified_name(), description, - supported_dtype); + supported_dtype, note); bind_registered_operation( module, @@ -838,7 +939,7 @@ void bind_binary_backward_overload(py::module& module, const binary_backward_ope } template -void bind_binary_backward_assign(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "") { +void bind_binary_backward_assign(py::module& module, const binary_backward_operation_t& operation, const std::string_view description, const std::string_view supported_dtype = "BFLOAT16") { auto doc = fmt::format( R"doc( @@ -858,7 +959,17 @@ void bind_binary_backward_assign(py::module& module, const binary_backward_opera round_mode (str, optional): Round mode for the operation. Defaults to `None`. Note: - {3} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {3} + - TILE + - 2, 3, 4 Example: >>> grad_tensor = ttnn.to_device(ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16)), device) @@ -924,288 +1035,126 @@ void py_module(py::module& module) { module, ttnn::mul_bw, R"doc(Performs backward operations for multiply on :attr:`input_tensor_a`, :attr:`input_tensor_b`, with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_bw( module, ttnn::add_bw, R"doc(Performs backward operations for add of :attr:`input_tensor_a` and :attr:`input_tensor_b` or :attr:`scalar` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_bw( module, ttnn::sub_bw, R"doc(Performs backward operations for subtract of :attr:`input_tensor_a` and :attr:`input_tensor_b` or :attr:`scalar` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_bw_div( module, ttnn::div_bw, R"doc(Performs backward operations for divide on :attr:`input_tensor`, :attr:`alpha` or :attr:`input_tensor_a`, :attr:`input_tensor_b`, :attr:`round_mode`, with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - 
+----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_backward_overload( module, ttnn::remainder_bw, R"doc(Performs backward operations for remainder of :attr:`input_tensor_a`, :attr:`scalar` or :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16 | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - supported only in WHB0. - - )doc"); + R"doc(BFLOAT16)doc", R"doc(Supported only in WHB0.)doc"); detail::bind_binary_backward_overload( module, ttnn::fmod_bw, R"doc(Performs backward operations for fmod of :attr:`input_tensor_a`, :attr:`scalar` or :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16 | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16)doc"); detail::bind_binary_backward_assign( module, ttnn::assign_bw, - R"doc(Performs backward operations for assign of :attr:`input_tensor_a`, :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | ROW_MAJOR, TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(Performs backward operations for assign of :attr:`input_tensor_a`, :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_backward_ops( module, ttnn::atan2_bw, R"doc(Performs backward operations for atan2 of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_backward_sub_alpha( module, ttnn::subalpha_bw, "alpha", "Alpha value", 1.0f, R"doc(Performs backward operations for subalpha of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - 
+----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_backward_opt_float_default( module, ttnn::addalpha_bw, "alpha", "Alpha value", 1.0f, R"doc(Performs backward operations for addalpha on :attr:`input_tensor_b` , :attr:`input_tensor_a` and :attr:`alpha` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_backward_ops( module, ttnn::xlogy_bw, R"doc(Performs backward operations for xlogy of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: + R"doc(BFLOAT16, BFLOAT8_B)doc"); - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16 | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); detail::bind_binary_backward_ops( module, ttnn::hypot_bw, R"doc(Performs backward operations for hypot of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ + R"doc(BFLOAT16, BFLOAT8_B)doc"); - )doc"); detail::bind_binary_backward_ops( module, ttnn::ldexp_bw, R"doc(Performs backward operations for ldexp of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: + R"doc(BFLOAT16)doc"); - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); detail::bind_binary_backward_ops( module, ttnn::logaddexp_bw, R"doc(Performs backward operations for logaddexp of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ + R"doc(BFLOAT16)doc"); - )doc"); detail::bind_binary_backward_ops( module, ttnn::logaddexp2_bw, R"doc(Performs backward operations for logaddexp2 of :attr:`input_tensor_a` 
and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16)doc"); detail::bind_binary_backward_ops( module, ttnn::squared_difference_bw, R"doc(Performs backward operations for squared_difference of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: + R"doc(BFLOAT16, BFLOAT8_B)doc"); - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); detail::bind_binary_backward_int_default( module, ttnn::concat_bw, "dim", "Dimension to concatenate", 0, R"doc(Performs backward operations for concat on :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16 | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16)doc"); detail::bind_binary_backward_rsub( module, ttnn::rsub_bw, R"doc(Performs backward operations for subraction of :attr:`input_tensor_a` from :attr:`input_tensor_b` with given :attr:`grad_tensor` (reversed order of subtraction operator).)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_binary_backward_ops( module, ttnn::min_bw, - R"doc(Performs backward operations for minimum of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16 | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(Performs backward operations for minimum of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc"); detail::bind_binary_backward_ops( module, ttnn::max_bw, - R"doc(Performs backward operations for maximum of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - 
+----------------------------+---------------------------------+-------------------+ - | BFLOAT16 | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(Performs backward operations for maximum of :attr:`input_tensor_a` and :attr:`input_tensor_b` with given :attr:`grad_tensor`.)doc"); detail::bind_binary_backward_float_string_default( module, @@ -1217,15 +1166,7 @@ void py_module(py::module& module) { "none", R"doc(Performs backward operations for bias_gelu on :attr:`input_tensor_a` and :attr:`input_tensor_b` or :attr:`input_tensor` and :attr:`bias`, with given :attr:`grad_tensor` using given :attr:`approximate` mode. :attr:`approximate` mode can be 'none', 'tanh'.)doc", - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + R"doc(BFLOAT16, BFLOAT8_B)doc"); } } // namespace binary_backward diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation.cpp index b8f7e3c1c10..255ca459504 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation.cpp @@ -149,7 +149,7 @@ void UnaryDeviceOperation::validate_on_program_cache_miss( shape_return_value_t UnaryDeviceOperation::compute_output_shapes( const operation_attributes_t&, const tensor_args_t& tensor_args) { - return {tensor_args.input.get_shape()}; + return {tensor_args.input.get_logical_shape()}; } tensor_return_value_t UnaryDeviceOperation::create_output_tensors( @@ -158,13 +158,12 @@ tensor_return_value_t UnaryDeviceOperation::create_output_tensors( return tensor_args.preallocated_output.value(); } - const auto output_shape = compute_output_shapes(args, tensor_args); - auto output_layout = Layout::TILE; if (args.output_memory_config.is_sharded()) { output_layout = tensor_args.input.get_layout(); } + const auto output_shape = tensor_args.input.shape(); return create_device_tensor( output_shape, args.output_dtype, output_layout, tensor_args.input.device(), args.output_memory_config); } diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation_types.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation_types.hpp index f600d7317f5..95d100a9c85 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation_types.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation_types.hpp @@ -27,6 +27,6 @@ struct tensor_args_t { using tensor_return_value_t = Tensor; -using shape_return_value_t = ttnn::Shape; +using shape_return_value_t = ttnn::SimpleShape; } // namespace ttnn::operations::unary diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp index 04011c73f1d..4a720efa199 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp @@ -897,7 +897,7 @@ void bind_identity(py::module& module, const unary_operation_t& operation) { } template -void bind_power(py::module& module, const unary_operation_t& operation, const 
std::string& info_doc = "") { +void bind_power(py::module& module, const unary_operation_t& operation, const std::string& supported_dtype="BFLOAT16", const std::string& info_doc = "") { auto doc = fmt::format( R"doc( Applies {0} to :attr:`input_tensor` element-wise. @@ -918,7 +918,19 @@ void bind_power(py::module& module, const unary_operation_t& operation, const st ttnn.Tensor: the output tensor. Note: - {2} + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {2} + - TILE + - 2, 3, 4 + + {3} Example: >>> tensor = ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16), device=device) @@ -926,6 +938,7 @@ void bind_power(py::module& module, const unary_operation_t& operation, const st )doc", ttnn::pow.base_name(), ttnn::pow.python_fully_qualified_name(), + supported_dtype, info_doc); bind_registered_operation( @@ -1789,16 +1802,7 @@ void py_module(py::module& module) { detail::bind_sigmoid_accurate(module, ttnn::sigmoid_accurate); detail::bind_unary_chain(module, ttnn::unary_chain); detail::bind_identity(module, ttnn::identity); - detail::bind_power(module, ttnn::pow, - R"doc(Supported dtypes, layouts, and ranks: - - +----------------------------+---------------------------------+-------------------+ - | Dtypes | Layouts | Ranks | - +----------------------------+---------------------------------+-------------------+ - | BFLOAT16, BFLOAT8_B | TILE | 2, 3, 4 | - +----------------------------+---------------------------------+-------------------+ - - )doc"); + detail::bind_power(module, ttnn::pow, R"doc(BFLOAT16, BFLOAT8_B)doc"); // unary composite imported into ttnn detail::bind_unary_composite(module, ttnn::deg2rad, R"doc(Performs deg2rad function on :attr:`input_tensor`.)doc"); diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp index 4fd3da1951b..c3ed75dcb6f 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp @@ -163,7 +163,7 @@ std::vector all_gather_matmul( /* Matmul setup */ - bool user_run_batched = ttnn::operations::matmul::detail::is_input_batched(weight_tensor.get_shape()); + bool user_run_batched = ttnn::operations::matmul::detail::is_input_batched(weight_tensor.get_logical_shape()); std::optional user_core_coord; if (core_grid.has_value()) { user_core_coord = CoreCoord(core_grid->x, core_grid->y); diff --git a/ttnn/cpp/ttnn/operations/experimental/experimental_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/experimental_pybind.cpp index 19b097d0b21..8f5c68b23de 100644 --- a/ttnn/cpp/ttnn/operations/experimental/experimental_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/experimental_pybind.cpp @@ -20,6 +20,8 @@ #include "ttnn/operations/experimental/transformer/nlp_create_qkv_heads/nlp_create_qkv_heads_pybind.hpp" #include "ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/nlp_create_qkv_heads_decode_pybind.hpp" #include "ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/nlp_create_qkv_heads_falcon7b_pybind.hpp" +#include "ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.hpp" +#include "ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.hpp" #include 
"ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/nlp_kv_cache_load_slice_pybind.hpp" #include "ttnn/operations/experimental/paged_cache/paged_cache_pybind.hpp" #include "ttnn/operations/experimental/transformer/rotary_embedding/rotary_embedding_pybind.hpp" @@ -44,6 +46,8 @@ void py_module(py::module& module) { transformer::detail::bind_nlp_concat_heads_decode(module); transformer::detail::bind_nlp_create_qkv_heads_decode(module); transformer::detail::bind_nlp_create_qkv_heads_falcon7b(module); + transformer::detail::bind_nlp_create_qkv_heads_vit(module); + transformer::detail::bind_nlp_create_qkv_heads_segformer(module); transformer::detail::bind_nlp_kv_cache_load_slice(module); transformer::py_bind_rotary_embedding(module); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp new file mode 100644 index 00000000000..092c948382d --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp @@ -0,0 +1,50 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + + +void kernel_main() { + // READER RUNTIME ARGS + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in1_tensor_addr = get_arg_val(1); + uint32_t num_blocks = get_arg_val(2); + uint32_t in0_tensor_tile_id = get_arg_val(3); + uint32_t in1_tensor_tile_id = get_arg_val(4); + + // COMPILE TIME ARGS + // interleaved accessor args + constexpr uint32_t in0_is_dram = get_compile_time_arg_val(0); + constexpr uint32_t in1_is_dram = get_compile_time_arg_val(1); + // READER COMPILE TIME ARGS + constexpr uint32_t q_num_tiles = get_compile_time_arg_val(2); + + + constexpr uint32_t cb_id_qv = 1; // cb for Q, V heads + + constexpr uint32_t onetile = 1; + const uint32_t single_tile_size_bytes = get_tile_size(cb_id_qv); + const DataFormat data_format = get_dataformat(cb_id_qv); + + constexpr bool in0_is_dram_bool = in0_is_dram == 1; + const InterleavedAddrGenFast s0 = { + .bank_base_address = in0_tensor_addr, + .page_size = single_tile_size_bytes, + .data_format = data_format, + }; + + + for (uint32_t block = 0; block < num_blocks; block++) { + // Q + for (uint32_t i = 0; i < q_num_tiles; i++) { + cb_reserve_back(cb_id_qv, onetile); + uint32_t l1_write_addr = get_write_ptr(cb_id_qv); + noc_async_read_tile(in0_tensor_tile_id, s0, l1_write_addr); + noc_async_read_barrier(); + cb_push_back(cb_id_qv, onetile); + in0_tensor_tile_id++; + } + } +} diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp new file mode 100644 index 00000000000..e00a56b5c67 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp @@ -0,0 +1,75 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include "dataflow_api.h" + + +void kernel_main() { + // WRITER RUNTIME ARGS + uint32_t q_tensor_addr = get_arg_val(0); + uint32_t num_blocks = get_arg_val(1); + uint32_t q_out_h_dim = get_arg_val(2); + uint32_t q_out_tensor_tile_id = get_arg_val(3); + + // COMPILE TIME ARGS + // interleaved accessor args + constexpr uint32_t out_is_dram = get_compile_time_arg_val(0); + constexpr uint32_t q_out_h_tiles = get_compile_time_arg_val(1); + constexpr uint32_t q_out_w_tiles = get_compile_time_arg_val(2); + constexpr uint32_t q_out_HtWt = get_compile_time_arg_val(3); + constexpr uint32_t q_out_c = get_compile_time_arg_val(4); + + constexpr uint32_t cb_id_qv = 1; // cb for Q, V heads tiles + + const uint32_t single_tile_size_bytes = get_tile_size(cb_id_qv); + const DataFormat data_format = get_dataformat(cb_id_qv); + + constexpr bool out_is_dram_bool = out_is_dram == 1; + const InterleavedAddrGenFast sq = { + .bank_base_address = q_tensor_addr, + .page_size = single_tile_size_bytes, + .data_format = data_format + }; + + constexpr uint32_t block_size = 1; // micro-block size for read/write; nothing to do with num_blocks + // TODO: This might negatively impact perf + constexpr uint32_t out_num_tiles_read = block_size; // always read and pop by micro-block size for generality + uint32_t l1_read_addr; + uint32_t q_out_tensor_current_tile_id; // need this to update q_out_tensor_tile_id + uint32_t out_tensor_current_tile_id_along_c; + + for (uint32_t block = 0; block < num_blocks; block++) { + // q + create q head --> outputs: [B, num_q_heads, S, head_dim] + out_tensor_current_tile_id_along_c = q_out_tensor_tile_id; + for (uint32_t c_dim = 0; c_dim < q_out_c; c_dim++) { + q_out_tensor_current_tile_id = out_tensor_current_tile_id_along_c; + for (uint32_t w_dim = 0; w_dim < q_out_w_tiles; w_dim++) { + cb_wait_front(cb_id_qv, out_num_tiles_read); + l1_read_addr = get_read_ptr(cb_id_qv); + noc_async_write_tile(q_out_tensor_current_tile_id, sq, l1_read_addr); + + noc_async_write_barrier(); + cb_pop_front(cb_id_qv, out_num_tiles_read); + + q_out_tensor_current_tile_id++; + } + out_tensor_current_tile_id_along_c += q_out_HtWt; + } + + // Update out_tensor_tile_id for next h_dim or batch if we finish one CHtWt + q_out_h_dim++; + if (q_out_h_dim < q_out_h_tiles) { + q_out_tensor_tile_id += q_out_w_tiles; + } else { + // If we finish one batch, always roll over to next tile in memory + // This is just the current_tile_id, except for K when we transpose heads + // In this case, decrement k_out_tensor_current_tile_id by the stride (q_out_h_tiles) and add 1 to roll over + q_out_tensor_tile_id = q_out_tensor_current_tile_id; + q_out_h_dim = 0; + } + } + +} diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.cpp new file mode 100644 index 00000000000..c3dcffdc410 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.cpp @@ -0,0 +1,57 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "nlp_create_qkv_heads_segformer_device_operation.hpp" + +#include "tt_metal/common/work_split.hpp" + +namespace ttnn::operations::experimental::transformer { + +// Hard-coded for Segformer +void NlpCreateHeadsSegformerDeviceOperation::validate(const std::vector& input_tensors) const { + const auto& input_tensor = input_tensors.at(0); + const auto input_shape = input_tensor.get_legacy_shape(); + + TT_FATAL(input_tensor.storage_type() == StorageType::DEVICE, "Operands to TM need to be on device!"); + TT_FATAL(input_tensor.buffer() != nullptr, "Operands to TM need to be allocated in buffers on device!"); + TT_FATAL(input_tensor.get_dtype() == tt::tt_metal::DataType::FLOAT32 || input_tensor.get_dtype() == tt::tt_metal::DataType::BFLOAT16 || input_tensor.get_dtype() == tt::tt_metal::DataType::BFLOAT8_B, "Unsupported data format"); + TT_FATAL(input_tensor.get_layout() == Layout::TILE, "Error"); + + TT_FATAL(input_shape[2] % tt::constants::TILE_HEIGHT == 0, "Error"); + TT_FATAL(input_shape[3] % tt::constants::TILE_HEIGHT == 0, "Error"); + //TT_FATAL((input_shape == tt::tt_metal::LegacyShape({input_shape[0], 1, input_shape[2], 2304})), "Unsupported input shape"); + TT_FATAL(this->output_mem_config.memory_layout == TensorMemoryLayout::INTERLEAVED, "Error"); +} + +std::vector NlpCreateHeadsSegformerDeviceOperation::compute_output_shapes(const std::vector& input_tensors) const { + std::vector output_shape_vec; + const auto& input_tensor = input_tensors.at(0); + const auto input_shape = input_tensor.get_legacy_shape(); + const auto head_dim = 32; // head_dim is hard-coded = 32 + auto num_heads = input_shape[3] / tt::constants::TILE_HEIGHT; // head_dim is hard-coded = 32 + output_shape_vec = {(tt::tt_metal::LegacyShape) {input_shape[0], num_heads, input_shape[2], head_dim}, (tt::tt_metal::LegacyShape) {input_shape[0], num_heads, input_shape[2], head_dim}, (tt::tt_metal::LegacyShape) {input_shape[0], num_heads, input_shape[2], head_dim}}; + //output_shape_vec = {(tt::tt_metal::LegacyShape) {input_shape[0], num_heads, input_shape[2], head_dim}, }; + + return output_shape_vec; +} + +std::vector NlpCreateHeadsSegformerDeviceOperation::create_output_tensors(const std::vector& input_tensors) const { + const auto& input_tensor = input_tensors.at(0); + if (this->output_mem_config.is_sharded()) { + TT_ASSERT(false); + return {}; + } else { + return operation::generic_create_output_tensors(*this, input_tensors, input_tensor.get_dtype(), Layout::TILE, this->output_mem_config); + } +} + +operation::ProgramWithCallbacks NlpCreateHeadsSegformerDeviceOperation::create_program(const std::vector& input_tensors, std::vector &output_tensors) const { + const auto& input_tensor = input_tensors.at(0); + auto& output_tensor = output_tensors.at(0); + + CoreCoord compute_with_storage_grid_size = input_tensor.device()->compute_with_storage_grid_size(); + + return multi_core_nlp_create_qkv_heads_segformer(input_tensor, output_tensors, compute_with_storage_grid_size); +} +} // namespace ttnn::operations::experimental::transformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.hpp new file mode 100644 index 00000000000..66fb3c45953 --- /dev/null +++ 
b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.hpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "ttnn/run_operation.hpp" +#include + +#include "ttnn/common/constants.hpp" +#include "ttnn/tensor/tensor.hpp" +#include "ttnn/device_operation.hpp" + +namespace ttnn::operations::experimental::transformer { + +operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_segformer(const Tensor &input_tensor_a, std::vector &output, CoreCoord compute_with_storage_grid_size); + +struct NlpCreateHeadsSegformerDeviceOperation { + MemoryConfig output_mem_config; + + void validate(const std::vector& input_tensors) const; + std::vector compute_output_shapes(const std::vector& input_tensors) const; + std::vector create_output_tensors(const std::vector& input_tensors) const; + operation::ProgramWithCallbacks create_program( + const std::vector& input_tensors, std::vector& output_tensors) const; +}; + +} // namespace ttnn::operations::experimental::transformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_program_factory.cpp new file mode 100644 index 00000000000..ef409dae2cf --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_program_factory.cpp @@ -0,0 +1,195 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt_metal/host_api.hpp" +#include "tt_metal/common/constants.hpp" +#include "tt_metal/detail/util.hpp" +#include "nlp_create_qkv_heads_segformer_device_operation.hpp" +#include "tt_metal/common/work_split.hpp" + +namespace ttnn::operations::experimental::transformer { + +using namespace tt::constants; +using namespace tt; +using namespace tt::tt_metal; + + +operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_segformer(const Tensor &a, std::vector& output, CoreCoord compute_with_storage_grid_size) { + + const auto& ashape = a.get_legacy_shape(); + + tt_metal::Device *device = a.device(); + + tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + + uint32_t single_tile_size = tt_metal::detail::TileSize(cb_data_format); + tt_metal::Buffer *in0_buffer = a.buffer(); + TT_ASSERT(in0_buffer->size() % single_tile_size == 0); + // Dummy + tt_metal::Buffer *in1_buffer; + uint32_t in1_buffer_addr = 0; + + + //////////////////////////////////////////////////////////////////////////// + // TM Parameters Setup + //////////////////////////////////////////////////////////////////////////// + const uint32_t head_dim = 32; + uint32_t per_tensor_tiles = ashape[3] / TILE_WIDTH ; + const uint32_t q_num_tiles_per_tensor = per_tensor_tiles; + const uint32_t num_q_heads = q_num_tiles_per_tensor; //hard-coding the head_dim = 32 + + // Per output tensor args + // Output shape for Q/K/V is: [B, head_num, s, 32] # Needs shuffling from [B, 1, s, hidden_dim] + uint32_t q_out_h_tiles = ashape[2] / TILE_WIDTH; + uint32_t q_out_w_tiles = 1; //hard-coding the head_dim = 32 + uint32_t q_out_c = q_num_tiles_per_tensor / q_out_w_tiles; // num_heads + uint32_t q_out_HtWt = q_out_h_tiles * q_out_w_tiles; + uint32_t 
q_out_CHtWt = q_out_c * q_out_HtWt; + uint32_t q_num_tiles = num_q_heads * q_out_w_tiles; + + uint32_t num_cores_x = compute_with_storage_grid_size.x; + uint32_t num_cores_y = compute_with_storage_grid_size.y; + // Block is a unit of work; ie. num of per_tensor_tiles per core + uint32_t num_blocks = ashape[0] * ashape[1] * ashape[2] / TILE_HEIGHT; + auto [num_cores, all_cores, core_group_1, core_group_2, num_blocks_per_core_group_1, num_blocks_per_core_group_2] = tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_blocks); + + + //////////////////////////////////////////////////////////////////////////// + // Grayskull Device Setup + //////////////////////////////////////////////////////////////////////////// + TT_ASSERT((output.size() == 3), "Output vector must be size 3 (Q, K, V)!"); + tt_metal::Tensor& q = output[0]; + tt_metal::Tensor& k = output[1]; + tt_metal::Tensor& v = output[2]; + + tt_metal::Buffer *q_buffer = q.buffer(); + TT_ASSERT(q_buffer != nullptr, "Output q buffer should be allocated on device!"); + tt_metal::Buffer *k_buffer = k.buffer(); + TT_ASSERT(k_buffer != nullptr, "Output k buffer should be allocated on device!"); + tt_metal::Buffer *v_buffer = v.buffer(); + TT_ASSERT(v_buffer != nullptr, "Output v buffer should be allocated on device!"); + + + //////////////////////////////////////////////////////////////////////////// + // Application Setup + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program program = tt_metal::CreateProgram(); + + bool tile_dtype_is_bfloat16 = a.get_dtype() == tt::tt_metal::DataType::BFLOAT16; + bool in0_is_dram = in0_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; + bool out_is_dram = q_buffer->buffer_type() == tt_metal::BufferType::DRAM ?
1 : 0; + bool in1_is_dram = false; + + std::vector reader_compile_time_args = { + // interleaved accessor args + (std::uint32_t) in0_is_dram, + (std::uint32_t) in1_is_dram, + (std::uint32_t) q_num_tiles, + }; + std::vector writer_compile_time_args = { + // interleaved accessor args + (std::uint32_t) out_is_dram, + (std::uint32_t) q_out_h_tiles, + (std::uint32_t) q_out_w_tiles, + (std::uint32_t) q_out_HtWt, + (std::uint32_t) num_q_heads, // q_out_c + }; + + ///////////// K transpose //////////////////// + const bool transpose_k_heads = false; + std::map reader_defines; + std::map writer_defines; + + ////////////////////////////////////////////// + auto reader_kernel_id = tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp", + all_cores, + tt_metal::ReaderDataMovementConfig(reader_compile_time_args, reader_defines)); + + auto writer_kernel_id = tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp", + all_cores, + tt_metal::WriterDataMovementConfig(writer_compile_time_args, writer_defines)); + + + + // Create circular buffers + uint32_t src1_cb_index = 1; + uint32_t cb0_num_tiles = per_tensor_tiles * 2; // double buffer + tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(cb0_num_tiles * single_tile_size, {{src1_cb_index, cb_data_format}}) + .set_page_size(src1_cb_index, single_tile_size); + auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src1_config); + + for (uint32_t i = 0, num_blocks_written = 0; i < num_cores; i++){ + CoreCoord core = {i / num_cores_y, i % num_cores_y}; + uint32_t num_blocks_per_core = 0; + if (core_group_1.contains(core)) { + num_blocks_per_core = num_blocks_per_core_group_1; + } else if (core_group_2.contains(core)) { + num_blocks_per_core = num_blocks_per_core_group_2; + } else { + TT_ASSERT(false, "Core not in specified core ranges"); + } + + std::vector reader_runtime_args = { + (std::uint32_t) in0_buffer->address(), + (std::uint32_t) in1_buffer_addr, + num_blocks_per_core, + num_blocks_written * per_tensor_tiles, + 0, + }; + + uint32_t q_out_h_dim = num_blocks_written % q_out_h_tiles; + uint32_t q_out_tensor_tile_id = num_blocks_written / q_out_h_tiles * q_out_CHtWt + q_out_h_dim * q_out_w_tiles; + + std::vector writer_runtime_args = { + (std::uint32_t) q_buffer->address(), // q_tensor_addr + num_blocks_per_core, // num_blocks + q_out_h_dim, + q_out_tensor_tile_id, + }; + + tt_metal::SetRuntimeArgs(program, reader_kernel_id, core, reader_runtime_args); + tt_metal::SetRuntimeArgs(program, writer_kernel_id, core, writer_runtime_args); + num_blocks_written += num_blocks_per_core; + } + + auto override_runtime_args_callback = [ + reader_kernel_id, + writer_kernel_id, + num_cores, + num_cores_y + ] + ( + const Program &program, + const std::vector& input_buffers, + const std::vector& output_buffers + ) { + + auto src_dram_buffer = input_buffers.at(0); + + auto dst_dram_buffer_query = output_buffers.at(0); + + for (uint32_t i = 0, num_blocks_written = 0; i < num_cores; i++){ + CoreCoord core = {i / num_cores_y, i % num_cores_y}; + + { + auto &runtime_args = GetRuntimeArgs(program, reader_kernel_id, core); + runtime_args[0] = src_dram_buffer->address(); + } + + { + auto &runtime_args = GetRuntimeArgs(program, writer_kernel_id, core); + runtime_args[0] = 
dst_dram_buffer_query->address(); + } + } + }; + + return {std::move(program), override_runtime_args_callback}; +} + + +} // ttnn::operations::experimental::transformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.cpp new file mode 100644 index 00000000000..176192091a0 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.cpp @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "nlp_create_qkv_heads_segformer.hpp" + +namespace ttnn::operations::experimental::transformer { + + std::tuple NLPCreateHeadsSegformerOperation::invoke ( + uint8_t queue_id, + const Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>> optional_output_tensors) { + const MemoryConfig output_mem_config = memory_config.value_or(input_tensor_q.memory_config()); + auto optional_outputs = std::vector>{}; + if (optional_output_tensors.has_value()) { + optional_outputs = {optional_output_tensors.value().begin(), optional_output_tensors.value().end()}; + } + else { + optional_outputs = {}; + } + auto outputs = operation::run(NlpCreateHeadsSegformerDeviceOperation{output_mem_config}, {input_tensor_q}, {}, optional_outputs); + return {outputs[0], outputs[1], outputs[2]}; + //return {outputs[0]} + }; + + std::tuple NLPCreateHeadsSegformerOperation::invoke ( + const Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>> optional_output_tensors) { + return invoke(ttnn::DefaultQueueId, input_tensor_q, memory_config, optional_output_tensors); + }; + +} // namespace ttnn::operations::experimental::transformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.hpp new file mode 100644 index 00000000000..4ac1ec051ec --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.hpp @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "device/nlp_create_qkv_heads_segformer_device_operation.hpp" +#include "ttnn/run_operation.hpp" +#include "ttnn/operations/core/core.hpp" + +namespace ttnn { +namespace operations::experimental::transformer { + +struct NLPCreateHeadsSegformerOperation { + static std::tuple invoke ( + uint8_t queue_id, + const Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>> optional_output_tensors = std::nullopt); + + static std::tuple invoke ( + const Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>> optional_output_tensors = std::nullopt); +}; +} // namespace operations::experimental::transformer + + +namespace experimental { + +constexpr auto nlp_create_qkv_heads_segformer = ttnn::register_operation_with_auto_launch_op< + "ttnn::experimental::nlp_create_qkv_heads_segformer", + ttnn::operations::experimental::transformer::NLPCreateHeadsSegformerOperation>(); + +} // namespace experimental + +} // namespace ttnn diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.cpp new file mode 100644 index 00000000000..adf07afb76a --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.cpp @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "ttnn/cpp/pybind11/decorators.hpp" + +#include "ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer.hpp" +#include "ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.hpp" + + +namespace ttnn::operations::experimental::transformer::detail { + +void bind_nlp_create_qkv_heads_segformer(pybind11::module& module) { + ttnn::bind_registered_operation( + module, + ttnn::experimental::nlp_create_qkv_heads_segformer, + R"doc( + Shuffles [B, 1, S, 2304] fused qkv matrix into 3 heads with shapes [B, 12, S, 64], [B, 12, S, 64], and [B, 12, S, 64]. + )doc", + ttnn::pybind_overload_t{ + [] (const decltype(ttnn::experimental::nlp_create_qkv_heads_segformer) &self, + const ttnn::Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>>& optional_output_tensors, + uint8_t queue_id) { + return self(queue_id, input_tensor_q, memory_config, optional_output_tensors); + }, + pybind11::arg("input").noconvert(), + pybind11::kw_only(), + pybind11::arg("memory_config").noconvert() = std::nullopt, + pybind11::arg("output_tensors").noconvert() = std::nullopt, + pybind11::arg("queue_id") = 0}); +}; +} // namespace ttnn::operations::experimental::transformer::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.hpp new file mode 100644 index 00000000000..5d528d396e5 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/nlp_create_qkv_heads_segformer_pybind.hpp @@ -0,0 +1,13 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
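For reference, the binding registered above would be used from Python roughly as follows. This is an illustrative sketch only; the tensor shape, device handle, and memory config are assumptions chosen to match the "[B, 1, S, 2304]" fused qkv description in the docstring, not values taken from this change:

>>> fused_qkv = ttnn.from_torch(torch.randn(1, 1, 256, 2304, dtype=torch.bfloat16), layout=ttnn.TILE_LAYOUT, device=device)
>>> q, k, v = ttnn.experimental.nlp_create_qkv_heads_segformer(fused_qkv, memory_config=ttnn.DRAM_MEMORY_CONFIG)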
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "pybind11/pybind_fwd.hpp" + + +namespace ttnn::operations::experimental::transformer::detail { + +void bind_nlp_create_qkv_heads_segformer(pybind11::module& module); +} // namespace ttnn::operations::experimental::transformer::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp new file mode 100644 index 00000000000..6405790b53b --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp @@ -0,0 +1,95 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + + +void kernel_main() { + // READER RUNTIME ARGS + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in1_tensor_addr = get_arg_val(1); + uint32_t num_blocks = get_arg_val(2); + uint32_t in0_tensor_tile_id = get_arg_val(3); + uint32_t in1_tensor_tile_id = get_arg_val(4); + + // COMPILE TIME ARGS + // interleaved accessor args + constexpr uint32_t in0_is_dram = get_compile_time_arg_val(0); + constexpr uint32_t in1_is_dram = get_compile_time_arg_val(1); + // READER COMPILE TIME ARGS + constexpr uint32_t q_num_tiles = get_compile_time_arg_val(2); + constexpr uint32_t kv_num_tiles = get_compile_time_arg_val(3); + + + constexpr uint32_t cb_id_qv = 1; // cb for Q, V heads + #ifdef TRANSPOSE_K_HEADS + constexpr uint32_t cb_id_k = 0; // cb for K heads (used by compute) + #else + constexpr uint32_t cb_id_k = 1; // cb for K heads (directly to writer) + #endif + + constexpr uint32_t onetile = 1; + const uint32_t single_tile_size_bytes = get_tile_size(cb_id_qv); + const DataFormat data_format = get_dataformat(cb_id_qv); + + constexpr bool in0_is_dram_bool = in0_is_dram == 1; + const InterleavedAddrGenFast s0 = { + .bank_base_address = in0_tensor_addr, + .page_size = single_tile_size_bytes, + .data_format = data_format, + }; + + #ifdef READ_FROM_INPUT_TENSOR_KV + constexpr bool in1_is_dram_bool = in1_is_dram == 1; + const InterleavedAddrGenFast s1 = { + .bank_base_address = in1_tensor_addr, + .page_size = single_tile_size_bytes, + .data_format = data_format, + }; + #endif + + + for (uint32_t block = 0; block < num_blocks; block++) { + // Q + for (uint32_t i = 0; i < q_num_tiles; i++) { + cb_reserve_back(cb_id_qv, onetile); + uint32_t l1_write_addr = get_write_ptr(cb_id_qv); + noc_async_read_tile(in0_tensor_tile_id, s0, l1_write_addr); + noc_async_read_barrier(); + cb_push_back(cb_id_qv, onetile); + in0_tensor_tile_id++; + } + + // K + for (uint32_t i = 0; i < kv_num_tiles; i++) { + cb_reserve_back(cb_id_k, onetile); + uint32_t l1_write_addr = get_write_ptr(cb_id_k); + #ifdef READ_FROM_INPUT_TENSOR_KV + noc_async_read_tile(in1_tensor_tile_id, s1, l1_write_addr); + in1_tensor_tile_id++; + #else + noc_async_read_tile(in0_tensor_tile_id, s0, l1_write_addr); + in0_tensor_tile_id++; + #endif + noc_async_read_barrier(); + cb_push_back(cb_id_k, onetile); + } + + // V + for (uint32_t i = 0; i < kv_num_tiles; i++) { + cb_reserve_back(cb_id_qv, onetile); + uint32_t l1_write_addr = get_write_ptr(cb_id_qv); + #ifdef READ_FROM_INPUT_TENSOR_KV + noc_async_read_tile(in1_tensor_tile_id, s1, l1_write_addr); + in1_tensor_tile_id++; + #else + 
noc_async_read_tile(in0_tensor_tile_id, s0, l1_write_addr); + in0_tensor_tile_id++; + #endif + noc_async_read_barrier(); + cb_push_back(cb_id_qv, onetile); + } + } +} diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp new file mode 100644 index 00000000000..a8cd96bb20d --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp @@ -0,0 +1,155 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include "dataflow_api.h" + + +void kernel_main() { + // WRITER RUNTIME ARGS + uint32_t q_tensor_addr = get_arg_val(0); + uint32_t k_tensor_addr = get_arg_val(1); + uint32_t v_tensor_addr = get_arg_val(2); + uint32_t num_blocks = get_arg_val(3); + uint32_t q_out_h_dim = get_arg_val(4); + uint32_t q_out_tensor_tile_id = get_arg_val(5); + uint32_t k_out_tensor_tile_id = get_arg_val(6); + uint32_t v_out_tensor_tile_id = get_arg_val(7); + + // COMPILE TIME ARGS + // interleaved accessor args + constexpr uint32_t out_is_dram = get_compile_time_arg_val(0); + constexpr uint32_t q_out_h_tiles = get_compile_time_arg_val(1); + constexpr uint32_t q_out_w_tiles = get_compile_time_arg_val(2); + constexpr uint32_t q_out_HtWt = get_compile_time_arg_val(3); + constexpr uint32_t q_out_c = get_compile_time_arg_val(4); + constexpr uint32_t kv_out_c = get_compile_time_arg_val(5); + + + constexpr uint32_t cb_id_qv = 1; // cb for Q, V heads tiles + #ifdef TRANSPOSE_K_HEADS + constexpr uint32_t cb_id_k = 16; // cb for K heads (filled by compute) + #else + constexpr uint32_t cb_id_k = 1; // cb for K heads (directly from reader) + #endif + const uint32_t single_tile_size_bytes = get_tile_size(cb_id_qv); + const DataFormat data_format = get_dataformat(cb_id_qv); + + constexpr bool out_is_dram_bool = out_is_dram == 1; + const InterleavedAddrGenFast sq = { + .bank_base_address = q_tensor_addr, + .page_size = single_tile_size_bytes, + .data_format = data_format + }; + const InterleavedAddrGenFast sk = { + .bank_base_address = k_tensor_addr, + .page_size = single_tile_size_bytes, + .data_format = data_format + }; + const InterleavedAddrGenFast sv = { + .bank_base_address = v_tensor_addr, + .page_size = single_tile_size_bytes, + .data_format = data_format + }; + + constexpr uint32_t block_size = 1; // micro-block size for read/write; nothing to do with num_blocks + // TODO: This might negatively impact perf + constexpr uint32_t out_num_tiles_read = block_size; // always read and pop by micro-block size for generality + uint32_t l1_read_addr; + uint32_t q_out_tensor_current_tile_id; // need this to update q_out_tensor_tile_id + uint32_t k_out_tensor_current_tile_id; // need this to update k_out_tensor_tile_id + uint32_t v_out_tensor_current_tile_id; // need this to update v_out_tensor_tile_id + uint32_t out_tensor_current_tile_id_along_c; + + for (uint32_t block = 0; block < num_blocks; block++) { + // q + create q head --> outputs: [B, num_q_heads, S, head_dim] + out_tensor_current_tile_id_along_c = q_out_tensor_tile_id; + for (uint32_t c_dim = 0; c_dim < q_out_c; c_dim++) { + q_out_tensor_current_tile_id = out_tensor_current_tile_id_along_c; + for (uint32_t w_dim = 0; w_dim < q_out_w_tiles; w_dim++) { + 
cb_wait_front(cb_id_qv, out_num_tiles_read); + l1_read_addr = get_read_ptr(cb_id_qv); + noc_async_write_tile(q_out_tensor_current_tile_id, sq, l1_read_addr); + + noc_async_write_barrier(); + cb_pop_front(cb_id_qv, out_num_tiles_read); + + q_out_tensor_current_tile_id++; + } + out_tensor_current_tile_id_along_c += q_out_HtWt; + } + + // k + create k head --> outputs: [B, num_kv_heads, S, head_dim] + #ifndef TRANSPOSE_K_HEADS + out_tensor_current_tile_id_along_c = k_out_tensor_tile_id; + #else + k_out_tensor_current_tile_id = k_out_tensor_tile_id; + #endif + for (uint32_t c_dim = 0; c_dim < kv_out_c; c_dim++) { + #ifndef TRANSPOSE_K_HEADS + k_out_tensor_current_tile_id = out_tensor_current_tile_id_along_c; + #endif + for (uint32_t w_dim = 0; w_dim < q_out_w_tiles; w_dim++) { + cb_wait_front(cb_id_k, out_num_tiles_read); + l1_read_addr = get_read_ptr(cb_id_k); + noc_async_write_tile(k_out_tensor_current_tile_id, sk, l1_read_addr); + + noc_async_write_barrier(); + cb_pop_front(cb_id_k, out_num_tiles_read); + + #ifndef TRANSPOSE_K_HEADS + k_out_tensor_current_tile_id++; + #else + k_out_tensor_current_tile_id += q_out_h_tiles; + #endif + } + #ifndef TRANSPOSE_K_HEADS + out_tensor_current_tile_id_along_c += q_out_HtWt; + #endif + } + + // v + create v head --> outputs: [B, num_kv_heads, S, head_dim] + out_tensor_current_tile_id_along_c = v_out_tensor_tile_id; + for (uint32_t c_dim = 0; c_dim < kv_out_c; c_dim++) { + v_out_tensor_current_tile_id = out_tensor_current_tile_id_along_c; + for (uint32_t w_dim = 0; w_dim < q_out_w_tiles; w_dim++) { + cb_wait_front(cb_id_qv, out_num_tiles_read); + l1_read_addr = get_read_ptr(cb_id_qv); + noc_async_write_tile(v_out_tensor_current_tile_id, sv, l1_read_addr); + + noc_async_write_barrier(); + cb_pop_front(cb_id_qv, out_num_tiles_read); + + v_out_tensor_current_tile_id++; + } + out_tensor_current_tile_id_along_c += q_out_HtWt; + } + + // Update out_tensor_tile_id for next h_dim or batch if we finish one CHtWt + q_out_h_dim++; + if (q_out_h_dim < q_out_h_tiles) { + q_out_tensor_tile_id += q_out_w_tiles; + #ifndef TRANSPOSE_K_HEADS + k_out_tensor_tile_id += q_out_w_tiles; + #else + k_out_tensor_tile_id++; + #endif + v_out_tensor_tile_id += q_out_w_tiles; + } else { + // If we finish one batch, always roll over to next tile in memory + // This is just the current_tile_id, except for K when we transpose heads + // In this case, decrement k_out_tensor_current_tile_id by the stride (q_out_h_tiles) and add 1 to roll over + q_out_tensor_tile_id = q_out_tensor_current_tile_id; + #ifndef TRANSPOSE_K_HEADS + k_out_tensor_tile_id = k_out_tensor_current_tile_id; + #else + k_out_tensor_tile_id = ++k_out_tensor_current_tile_id - q_out_h_tiles; // inc by 1 and decrement by stride + #endif + v_out_tensor_tile_id = v_out_tensor_current_tile_id; + q_out_h_dim = 0; + } + } + +} diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.cpp new file mode 100644 index 00000000000..f528f4bf966 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.cpp @@ -0,0 +1,52 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "nlp_create_qkv_heads_vit_device_operation.hpp" + +#include "tt_metal/common/work_split.hpp" + +namespace ttnn::operations::experimental::transformer { + +// Hard-coded for Vit +void NlpCreateHeadsVitDeviceOperation::validate(const std::vector& input_tensors) const { + const auto& input_tensor = input_tensors.at(0); + const auto input_shape = input_tensor.get_legacy_shape(); + + TT_FATAL(input_tensor.storage_type() == StorageType::DEVICE, "Operands to TM need to be on device!"); + TT_FATAL(input_tensor.buffer() != nullptr, "Operands to TM need to be allocated in buffers on device!"); + TT_FATAL(input_tensor.get_dtype() == tt::tt_metal::DataType::FLOAT32 || input_tensor.get_dtype() == tt::tt_metal::DataType::BFLOAT16 || input_tensor.get_dtype() == tt::tt_metal::DataType::BFLOAT8_B, "Unsupported data format"); + TT_FATAL(input_tensor.get_layout() == Layout::TILE, "Error"); + + TT_FATAL(input_shape[2] % tt::constants::TILE_HEIGHT == 0, "Error"); + TT_FATAL((input_shape == tt::tt_metal::LegacyShape({input_shape[0], 1, input_shape[2], 2304})), "Unsupported input shape"); + TT_FATAL(this->output_mem_config.memory_layout == TensorMemoryLayout::INTERLEAVED, "Error"); +} + +std::vector NlpCreateHeadsVitDeviceOperation::compute_output_shapes(const std::vector& input_tensors) const { + std::vector output_shape_vec; + const auto& input_tensor = input_tensors.at(0); + const auto input_shape = input_tensor.get_legacy_shape(); + output_shape_vec = {(tt::tt_metal::LegacyShape) {input_shape[0], 12, input_shape[2], 64}, (tt::tt_metal::LegacyShape) {input_shape[0], 12, input_shape[2], 64}, (tt::tt_metal::LegacyShape) {input_shape[0], 12, input_shape[2], 64}}; + return output_shape_vec; +} + +std::vector NlpCreateHeadsVitDeviceOperation::create_output_tensors(const std::vector& input_tensors) const { + const auto& input_tensor = input_tensors.at(0); + if (this->output_mem_config.is_sharded()) { + TT_ASSERT(false); + return {}; + } else { + return operation::generic_create_output_tensors(*this, input_tensors, input_tensor.get_dtype(), Layout::TILE, this->output_mem_config); + } +} + +operation::ProgramWithCallbacks NlpCreateHeadsVitDeviceOperation::create_program(const std::vector& input_tensors, std::vector &output_tensors) const { + const auto& input_tensor = input_tensors.at(0); + auto& output_tensor = output_tensors.at(0); + + CoreCoord compute_with_storage_grid_size = input_tensor.device()->compute_with_storage_grid_size(); + + return multi_core_nlp_create_qkv_heads_vit(input_tensor, output_tensors, compute_with_storage_grid_size); +} +} // namespace ttnn::operations::experimental::transformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.hpp new file mode 100644 index 00000000000..2891d6a27ec --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.hpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "ttnn/run_operation.hpp" +#include + +#include "ttnn/common/constants.hpp" +#include "ttnn/tensor/tensor.hpp" +#include "ttnn/device_operation.hpp" + +namespace ttnn::operations::experimental::transformer { + +operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_vit(const Tensor &input_tensor_a, std::vector &output, CoreCoord compute_with_storage_grid_size); + +struct NlpCreateHeadsVitDeviceOperation { + MemoryConfig output_mem_config; + + void validate(const std::vector& input_tensors) const; + std::vector compute_output_shapes(const std::vector& input_tensors) const; + std::vector create_output_tensors(const std::vector& input_tensors) const; + operation::ProgramWithCallbacks create_program( + const std::vector& input_tensors, std::vector& output_tensors) const; +}; + +} // namespace ttnn::operations::experimental::transformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_program_factory.cpp new file mode 100644 index 00000000000..1e91f35399e --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_program_factory.cpp @@ -0,0 +1,249 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt_metal/host_api.hpp" +#include "tt_metal/common/constants.hpp" +#include "tt_metal/detail/util.hpp" +#include "nlp_create_qkv_heads_vit_device_operation.hpp" +#include "tt_metal/common/work_split.hpp" + +namespace ttnn::operations::experimental::transformer { + +using namespace tt::constants; +using namespace tt; +using namespace tt::tt_metal; + + +operation::ProgramWithCallbacks multi_core_nlp_create_qkv_heads_vit(const Tensor &a, std::vector& output, CoreCoord compute_with_storage_grid_size) { + + const auto& ashape = a.get_legacy_shape(); + + tt_metal::Device *device = a.device(); + + tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + + uint32_t single_tile_size = tt_metal::detail::TileSize(cb_data_format); + tt_metal::Buffer *in0_buffer = a.buffer(); + TT_ASSERT(in0_buffer->size() % single_tile_size == 0); + // Dummy + tt_metal::Buffer *in1_buffer; + uint32_t in1_buffer_addr = 0; + + + //////////////////////////////////////////////////////////////////////////// + // TM Parameters Setup + //////////////////////////////////////////////////////////////////////////// + uint32_t per_tensor_tiles = ashape[3] / TILE_WIDTH ; // 72 + const uint32_t q_num_tiles_per_tensor = 24; + const uint32_t kv_num_tiles_per_tensor = 24; + const uint32_t num_q_heads = 12; + const uint32_t num_kv_heads = 12; + + // Per output tensor args + // Output shape for Q,K,V is: [B, 12, s, 64] # Needs shuffling from [B, 1, s, 2304] + uint32_t q_out_h_tiles = ashape[2] / TILE_HEIGHT; + uint32_t q_out_w_tiles = 2; // head_dim + uint32_t q_out_c = q_num_tiles_per_tensor / q_out_w_tiles; // num_heads + uint32_t q_out_HtWt = q_out_h_tiles * q_out_w_tiles; + uint32_t q_out_CHtWt = q_out_c * q_out_HtWt; + uint32_t kv_out_CHtWt = num_kv_heads * q_out_HtWt; + uint32_t q_num_tiles = num_q_heads * q_out_w_tiles; + uint32_t kv_num_tiles = num_kv_heads * q_out_w_tiles; + + uint32_t num_cores_x = compute_with_storage_grid_size.x; + uint32_t num_cores_y = compute_with_storage_grid_size.y; + 
// Block is a unit of work; ie. num of per_tensor_tiles per core + uint32_t num_blocks = ashape[0] * ashape[1] * ashape[2] / TILE_HEIGHT; + auto [num_cores, all_cores, core_group_1, core_group_2, num_blocks_per_core_group_1, num_blocks_per_core_group_2] = tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_blocks); + + + //////////////////////////////////////////////////////////////////////////// + // Grayskull Device Setup + //////////////////////////////////////////////////////////////////////////// + TT_ASSERT((output.size() == 3), "Output vector must be size 3 for split fused qkv!"); + tt_metal::Tensor& q = output[0]; + tt_metal::Tensor& k = output[1]; + tt_metal::Tensor& v = output[2]; + + tt_metal::Buffer *q_buffer = q.buffer(); + TT_ASSERT(q_buffer != nullptr, "Output q buffer should be allocated on device!"); + tt_metal::Buffer *k_buffer = k.buffer(); + TT_ASSERT(k_buffer != nullptr, "Output k buffer should be allocated on device!"); + tt_metal::Buffer *v_buffer = v.buffer(); + TT_ASSERT(v_buffer != nullptr, "Output v buffer should be allocated on device!"); + + + //////////////////////////////////////////////////////////////////////////// + // Application Setup + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program program = tt_metal::CreateProgram(); + + bool tile_dtype_is_bfloat16 = a.get_dtype() == tt::tt_metal::DataType::BFLOAT16; + bool in0_is_dram = in0_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; + bool out_is_dram = q_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; + bool in1_is_dram = false; + + std::vector reader_compile_time_args = { + // interleaved accessor args + (std::uint32_t) in0_is_dram, + (std::uint32_t) in1_is_dram, + (std::uint32_t) q_num_tiles, + (std::uint32_t) kv_num_tiles, + }; + std::vector writer_compile_time_args = { + // interleaved accessor args + (std::uint32_t) out_is_dram, + (std::uint32_t) q_out_h_tiles, + (std::uint32_t) q_out_w_tiles, + (std::uint32_t) q_out_HtWt, + (std::uint32_t) num_q_heads, // q_out_c + (std::uint32_t) num_kv_heads, // kv_out_c + }; + + ///////////// K transpose //////////////////// + const bool transpose_k_heads = false; + std::map reader_defines; + std::map writer_defines; + if (transpose_k_heads) { + std::vector compute_args_core_group_1 = {num_blocks_per_core_group_1 * kv_num_tiles}; + auto compute_kernel_id_group_1 = tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp", + core_group_1, + tt_metal::ComputeConfig{.compile_args = compute_args_core_group_1} + ); + + if (core_group_2.num_cores() > 0) { + std::vector compute_args_core_group_2 = {num_blocks_per_core_group_2 * kv_num_tiles}; + auto compute_kernel_id_group_2 = tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp", + core_group_2, + tt_metal::ComputeConfig{.compile_args = compute_args_core_group_2} + ); + } + reader_defines["TRANSPOSE_K_HEADS"] = "1"; + writer_defines["TRANSPOSE_K_HEADS"] = "1"; + } + ////////////////////////////////////////////// + + auto reader_kernel_id = tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp", + all_cores, + tt_metal::ReaderDataMovementConfig(reader_compile_time_args, reader_defines)); + + auto writer_kernel_id = tt_metal::CreateKernel( + program, + 
"ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp", + all_cores, + tt_metal::WriterDataMovementConfig(writer_compile_time_args, writer_defines)); + + + + // Create circular buffers + uint32_t src1_cb_index = 1; + uint32_t cb0_num_tiles = per_tensor_tiles * 2; // double buffer + tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(cb0_num_tiles * single_tile_size, {{src1_cb_index, cb_data_format}}) + .set_page_size(src1_cb_index, single_tile_size); + auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src1_config); + + // If we transpose_k_heads: + // - reader will write to cb0, instead of cb1 + // - compute will wait on cb0 and write to cb16 + // - writer will wait on cb 16, instead of cb1 + if (transpose_k_heads) { + uint32_t src0_cb_index = 0; + uint32_t cb0_num_tiles = per_tensor_tiles * 2; // double buffer + tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(cb0_num_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) + .set_page_size(src0_cb_index, single_tile_size); + auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); + + uint32_t out_cb_index = 16; + uint32_t out_cb_num_tiles = per_tensor_tiles * 2; // double buffer + tt_metal::CircularBufferConfig cb_out_config = tt_metal::CircularBufferConfig(out_cb_num_tiles * single_tile_size, {{out_cb_index, cb_data_format}}) + .set_page_size(out_cb_index, single_tile_size); + auto cb_out = tt_metal::CreateCircularBuffer(program, all_cores, cb_out_config); + } + + for (uint32_t i = 0, num_blocks_written = 0; i < num_cores; i++){ + CoreCoord core = {i / num_cores_y, i % num_cores_y}; + uint32_t num_blocks_per_core = 0; + if (core_group_1.contains(core)) { + num_blocks_per_core = num_blocks_per_core_group_1; + } else if (core_group_2.contains(core)) { + num_blocks_per_core = num_blocks_per_core_group_2; + } else { + TT_ASSERT(false, "Core not in specified core ranges"); + } + + std::vector reader_runtime_args = { + (std::uint32_t) in0_buffer->address(), + (std::uint32_t) in1_buffer_addr, + num_blocks_per_core, + num_blocks_written * per_tensor_tiles, + 0, + }; + + uint32_t q_out_h_dim = num_blocks_written % q_out_h_tiles; + uint32_t q_out_tensor_tile_id = num_blocks_written / q_out_h_tiles * q_out_CHtWt + q_out_h_dim * q_out_w_tiles; + uint32_t v_out_tensor_tile_id = num_blocks_written / q_out_h_tiles * kv_out_CHtWt + q_out_h_dim * q_out_w_tiles; + uint32_t k_out_tensor_tile_id = transpose_k_heads ? 
num_blocks_written / q_out_h_tiles * kv_out_CHtWt + q_out_h_dim : v_out_tensor_tile_id; + + std::vector writer_runtime_args = { + (std::uint32_t) q_buffer->address(), // q_tensor_addr + (std::uint32_t) k_buffer->address(), // k_tensor_addr + (std::uint32_t) v_buffer->address(), // v_tensor_addr + num_blocks_per_core, // num_blocks + q_out_h_dim, + q_out_tensor_tile_id, + k_out_tensor_tile_id, + v_out_tensor_tile_id, + }; + + tt_metal::SetRuntimeArgs(program, reader_kernel_id, core, reader_runtime_args); + tt_metal::SetRuntimeArgs(program, writer_kernel_id, core, writer_runtime_args); + num_blocks_written += num_blocks_per_core; + } + + auto override_runtime_args_callback = [ + reader_kernel_id, + writer_kernel_id, + num_cores, + num_cores_y + ] + ( + const Program &program, + const std::vector& input_buffers, + const std::vector& output_buffers + ) { + + auto src_dram_buffer = input_buffers.at(0); + + auto dst_dram_buffer_query = output_buffers.at(0); + auto dst_dram_buffer_key = output_buffers.at(1); + auto dst_dram_buffer_value = output_buffers.at(2); + + for (uint32_t i = 0, num_blocks_written = 0; i < num_cores; i++){ + CoreCoord core = {i / num_cores_y, i % num_cores_y}; + + { + auto &runtime_args = GetRuntimeArgs(program, reader_kernel_id, core); + runtime_args[0] = src_dram_buffer->address(); + } + + { + auto &runtime_args = GetRuntimeArgs(program, writer_kernel_id, core); + runtime_args[0] = dst_dram_buffer_query->address(); + runtime_args[1] = dst_dram_buffer_key->address(); + runtime_args[2] = dst_dram_buffer_value->address(); + } + } + }; + + return {std::move(program), override_runtime_args_callback}; +} + + +} // ttnn::operations::experimental::transformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.cpp new file mode 100644 index 00000000000..55afb0f3daf --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.cpp @@ -0,0 +1,33 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "nlp_create_qkv_heads_vit.hpp" + +namespace ttnn::operations::experimental::transformer { + + std::tuple NLPCreateHeadsVitOperation::invoke ( + uint8_t queue_id, + const Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>> optional_output_tensors) { + const MemoryConfig output_mem_config = memory_config.value_or(input_tensor_q.memory_config()); + auto optional_outputs = std::vector>{}; + if (optional_output_tensors.has_value()) { + optional_outputs = {optional_output_tensors.value().begin(), optional_output_tensors.value().end()}; + } + else { + optional_outputs = {}; + } + auto outputs = operation::run(NlpCreateHeadsVitDeviceOperation{output_mem_config}, {input_tensor_q}, {}, optional_outputs); + return {outputs[0], outputs[1], outputs[2]}; + }; + + std::tuple NLPCreateHeadsVitOperation::invoke ( + const Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>> optional_output_tensors) { + return invoke(ttnn::DefaultQueueId, input_tensor_q, memory_config, optional_output_tensors); + }; + +} // namespace ttnn::operations::experimental::transformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.hpp new file mode 100644 index 00000000000..cb8ba8197e1 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.hpp @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "device/nlp_create_qkv_heads_vit_device_operation.hpp" +#include "ttnn/run_operation.hpp" +#include "ttnn/operations/core/core.hpp" + +namespace ttnn { +namespace operations::experimental::transformer { + +struct NLPCreateHeadsVitOperation { + static std::tuple invoke ( + uint8_t queue_id, + const Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>> optional_output_tensors = std::nullopt); + + static std::tuple invoke ( + const Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>> optional_output_tensors = std::nullopt); +}; +} // namespace operations::experimental::transformer + + +namespace experimental { + +constexpr auto nlp_create_qkv_heads_vit = ttnn::register_operation_with_auto_launch_op< + "ttnn::experimental::nlp_create_qkv_heads_vit", + ttnn::operations::experimental::transformer::NLPCreateHeadsVitOperation>(); + +} // namespace experimental + +} // namespace ttnn diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.cpp new file mode 100644 index 00000000000..0fde58745b4 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.cpp @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "ttnn/cpp/pybind11/decorators.hpp" + +#include "ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit.hpp" +#include "ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.hpp" + + +namespace ttnn::operations::experimental::transformer::detail { + +void bind_nlp_create_qkv_heads_vit(pybind11::module& module) { + ttnn::bind_registered_operation( + module, + ttnn::experimental::nlp_create_qkv_heads_vit, + R"doc( + Shuffles [B, 1, S, 2304] fused qkv matrix into 3 heads with shapes [B, 12, S, 64], [B, 12, S, 64], and [B, 12, S, 64]. + )doc", + ttnn::pybind_overload_t{ + [] (const decltype(ttnn::experimental::nlp_create_qkv_heads_vit) &self, + const ttnn::Tensor& input_tensor_q, + const std::optional& memory_config, + std::optional>>& optional_output_tensors, + uint8_t queue_id) { + return self(queue_id, input_tensor_q, memory_config, optional_output_tensors); + }, + pybind11::arg("input").noconvert(), + pybind11::kw_only(), + pybind11::arg("memory_config").noconvert() = std::nullopt, + pybind11::arg("output_tensors").noconvert() = std::nullopt, + pybind11::arg("queue_id") = 0}); +}; +} // namespace ttnn::operations::experimental::transformer::detail diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.hpp new file mode 100644 index 00000000000..49f53dc5ca9 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/nlp_create_qkv_heads_vit_pybind.hpp @@ -0,0 +1,13 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "pybind11/pybind_fwd.hpp" + + +namespace ttnn::operations::experimental::transformer::detail { + +void bind_nlp_create_qkv_heads_vit(pybind11::module& module); +} // namespace ttnn::operations::experimental::transformer::detail diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp index 93e36fee7f2..2e7144040f2 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.hpp @@ -21,7 +21,6 @@ namespace operations { namespace matmul { using ttnn::operations::unary::UnaryWithParam; -using tt::tt_metal::LegacyShape; /* * GENERAL MATMUL AND BMM @@ -170,8 +169,6 @@ struct Matmul { const std::vector &input_tensors, const std::vector> &optional_input_tensors) const; std::vector compute_output_shapes(const std::vector &input_tensors) const; - std::vector compute_output_shapes_dram_sharded( - const std::vector &input_tensors, uint32_t N_unpadded) const; std::vector create_output_tensors(const std::vector &input_tensors) const; operation::ProgramWithCallbacks create_program( const std::vector &input_tensors, diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul.cpp b/ttnn/cpp/ttnn/operations/matmul/matmul.cpp index c2d14cddedc..5bc05ffedb3 100644 --- a/ttnn/cpp/ttnn/operations/matmul/matmul.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/matmul.cpp @@ -18,7 +18,7 @@ namespace matmul { namespace detail { -bool is_input_batched(const ttnn::Shape& shape) { +bool is_input_batched(const ttnn::SimpleShape& shape) { auto is_batched = false; for (auto i = 0; i < shape.rank() - 2; ++i) { if (shape[i] > 1) { @@ -109,7 +109,7 @@ Tensor MatmulOperation::invoke( if (core_grid.has_value()) { user_core_coord = CoreCoord(core_grid->x, core_grid->y); } - bool user_run_batched = detail::is_input_batched(input_tensor_b.get_shape()); + bool user_run_batched = detail::is_input_batched(input_tensor_b.get_logical_shape()); return bound_matmul( input_tensor_a, input_tensor_b, @@ -147,7 +147,7 @@ Tensor LinearOperation::invoke( if (core_grid.has_value()) { user_core_coord = CoreCoord(core_grid->x, core_grid->y); } - bool b_is_batched = detail::is_input_batched(input_tensor_b.get_shape()); + bool b_is_batched = detail::is_input_batched(input_tensor_b.get_logical_shape()); TT_FATAL(!(b_is_batched && bias.has_value()), "Batched input not supported when bias exists (linear operation)."); return bound_matmul( diff --git a/ttnn/cpp/ttnn/operations/matmul/matmul.hpp b/ttnn/cpp/ttnn/operations/matmul/matmul.hpp index eb450bd2896..1501b005471 100644 --- a/ttnn/cpp/ttnn/operations/matmul/matmul.hpp +++ b/ttnn/cpp/ttnn/operations/matmul/matmul.hpp @@ -22,7 +22,7 @@ namespace matmul { namespace detail { -bool is_input_batched(const ttnn::Shape& shape); +bool is_input_batched(const ttnn::SimpleShape& logical_shape); } // namespace detail diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.cpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.cpp index 0278c3da6c3..2fc0719ec97 100644 --- a/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.cpp @@ -57,46 +57,39 @@ void Reduce::validate(const std::vector& input_tensors) const { } } -std::vector Reduce::compute_output_shapes(const std::vector& input_tensors) const { +std::vector Reduce::compute_output_specs(const std::vector& input_tensors) const { const auto& input_tensor = 
input_tensors.at(0); - auto output_shape = input_tensor.get_legacy_shape(); - auto padding = output_shape.padding(); + // TODO: Remove usage of input/output padded shape + // - Get output alignment from input alignment and output dtype, layout, mem_config + // - Get shard spec from output strides (logical shape + alignment)? + auto output_shape = input_tensor.get_logical_shape(); + auto output_padded_shape = input_tensor.get_padded_shape(); switch (this->dim) { case ReduceOpDim::H: - output_shape[2] = TILE_HEIGHT; - padding[2] = Padding::PadDimension{0, 31}; + output_shape[2] = 1; + output_padded_shape[2] = TILE_HEIGHT; break; case ReduceOpDim::W: - output_shape[3] = TILE_WIDTH; - padding[3] = Padding::PadDimension{0, 31}; + output_shape[3] = 1; + output_padded_shape[3] = TILE_WIDTH; break; case ReduceOpDim::HW: - output_shape[2] = TILE_HEIGHT; - output_shape[3] = TILE_WIDTH; - padding[2] = Padding::PadDimension{0, 31}; - padding[3] = Padding::PadDimension{0, 31}; + output_shape[2] = 1; + output_shape[3] = 1; + output_padded_shape[2] = TILE_HEIGHT; + output_padded_shape[3] = TILE_WIDTH; break; } - return {tt::tt_metal::LegacyShape(output_shape, padding)}; -} -std::vector Reduce::create_output_tensors(const std::vector& input_tensors) const { - const auto& input_tensor = input_tensors.at(0); - if (this->output_mem_config.is_sharded()) { - auto output_shape = this->compute_output_shapes(input_tensors).at(0); + auto output_mem_config = this->output_mem_config; + if (output_mem_config.is_sharded()) { auto shard_spec = input_tensor.shard_spec().value(); // TODO: This will segfault if input is not sharded... - // TODO: For reduction along H, the shard height is always 1 padded up to 32 (tile height) - // Need to clean this up to have new layout account for sharding with padding - shard_spec.shape[0] = tt_metal::compute_volume(output_shape) / output_shape[-1]; - auto mem_config = this->output_mem_config; - mem_config.shard_spec = shard_spec; - return { - create_device_tensor(output_shape, this->output_dtype, Layout::TILE, input_tensor.device(), mem_config)}; - } else { - return operation::generic_create_output_tensors( - *this, input_tensors, this->output_dtype, Layout::TILE, this->output_mem_config); + shard_spec.shape[0] = output_padded_shape.volume() / output_padded_shape[-1]; + output_mem_config.shard_spec = shard_spec; } + + return {ttnn::TensorSpec(output_shape, TensorLayout::fromLegacyPaddedShape(this->output_dtype, PageConfig(Layout::TILE), output_mem_config, ttnn::Shape(output_shape.view(), output_padded_shape.view())))}; } operation::ProgramWithCallbacks Reduce::create_program( diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.hpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.hpp index 2f7dfbdabd6..c53598910a1 100644 --- a/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/reduce_op.hpp @@ -30,8 +30,7 @@ struct Reduce { ttnn::DeviceComputeKernelConfig compute_kernel_config; void validate(const std::vector &input_tensors) const; - std::vector compute_output_shapes(const std::vector &input_tensors) const; - std::vector create_output_tensors(const std::vector &input_tensors) const; + std::vector compute_output_specs(const std::vector &input_tensors) const; operation::ProgramWithCallbacks create_program(const std::vector& input_tensors, std::vector &output_tensors) const; ReduceOpParallelizationStrategy get_parallelization_strategy(const std::vector& input_tensors) const; }; diff 
--git a/ttnn/cpp/ttnn/run_operation.cpp b/ttnn/cpp/ttnn/run_operation.cpp index 40e0f3a44ab..3d03485a101 100644 --- a/ttnn/cpp/ttnn/run_operation.cpp +++ b/ttnn/cpp/ttnn/run_operation.cpp @@ -298,18 +298,31 @@ template OptionalTensors run_without_autoformat( uint8_t cq_id); std::vector extract_legacy_shapes( - const std::variant, std::vector>&& shapes, const std::function& layout_provider) { + const std::variant, std::vector, std::vector>&& shapes, const std::function& layout_provider, const bool use_tensor_layout_from_tensor_spec) { if (std::holds_alternative>(shapes)) { return std::get>(std::move(shapes)); + } else if (std::holds_alternative>(shapes)) { + const auto& simple_shapes = std::get>(shapes); + std::vector legacy_shapes; + legacy_shapes.reserve(simple_shapes.size()); + for (size_t idx = 0; idx < simple_shapes.size(); idx++) { + TensorLayout tensor_layout = layout_provider(idx); + legacy_shapes.emplace_back(simple_shapes[idx].view(), tensor_layout.compute_padded_shape(simple_shapes[idx]).view()); + } + return legacy_shapes; + } else if (std::holds_alternative>(shapes)) { + const auto& tensor_specs = std::get>(shapes); + std::vector legacy_shapes; + legacy_shapes.reserve(tensor_specs.size()); + for (size_t idx = 0; idx < tensor_specs.size(); idx++) { + const auto& [simple_shape, output_layout] = tensor_specs[idx]; + TensorLayout tensor_layout = use_tensor_layout_from_tensor_spec ? output_layout : layout_provider(idx); + legacy_shapes.emplace_back(simple_shape.view(), tensor_layout.compute_padded_shape(simple_shape).view()); + } + return legacy_shapes; + } else { + TT_THROW("extract_legacy_shapes only supports LegacyShape, SimpleShape, or TensorSpec"); } - const auto& simple_shapes = std::get>(shapes); - std::vector legacy_shapes; - legacy_shapes.reserve(simple_shapes.size()); - for (size_t idx = 0; idx < simple_shapes.size(); idx++) { - TensorLayout tensor_layout = layout_provider(idx); - legacy_shapes.emplace_back(simple_shapes[idx].view(), tensor_layout.compute_padded_shape(simple_shapes[idx]).view()); - } - return legacy_shapes; } // To be deprecated/removed in favor of new implementation where ops specifically request how to format inputs/outputss @@ -361,7 +374,7 @@ Tensors run_with_autoformat( auto output_shapes = extract_legacy_shapes(operation.compute_output_shapes(input_tensors), [&](size_t idx) { auto tensor = output_tensors[idx]; return TensorLayout(tensor.get_dtype(), Layout::TILE, tensor.memory_config()); - }); + }, /*use_tensor_layout_from_tensor_spec=*/ true); TT_ASSERT(output_tensors.size() == output_shapes.size()); @@ -424,7 +437,7 @@ Tensors run_with_autoformat( auto output_shapes = extract_legacy_shapes(operation.compute_output_shapes(input_tensors), [&](size_t idx) { auto tensor = output_tensors[idx]; return TensorLayout(tensor.get_dtype(), output_layouts[idx], tensor.memory_config()); - }); + }, /*use_tensor_layout_from_tensor_spec=*/ false); TT_ASSERT(output_tensors.size() == output_shapes.size()); TT_ASSERT(output_tensors.size() == output_layouts.size()); diff --git a/ttnn/cpp/ttnn/run_operation.hpp b/ttnn/cpp/ttnn/run_operation.hpp index 3286146dfd8..1305a7e4da5 100644 --- a/ttnn/cpp/ttnn/run_operation.hpp +++ b/ttnn/cpp/ttnn/run_operation.hpp @@ -17,6 +17,11 @@ namespace tt::tt_metal { namespace operation { using ttnn::operations::experimental::auto_format::FormatParams; + +// TODO: create_output_tensors should become a fully manual path with no dependency on infra +// - Pass output shapes directly +// - Move default values for output_dtype and 
output_mem_config inside ops
+// - This function becomes just a regular helper function
 template <typename ConcreteOperation>
 auto generic_create_output_tensors(
     const ConcreteOperation& operation,
diff --git a/ttnn/cpp/ttnn/tensor/tensor.hpp b/ttnn/cpp/ttnn/tensor/tensor.hpp
index 2aa903fad62..8291cba455d 100644
--- a/ttnn/cpp/ttnn/tensor/tensor.hpp
+++ b/ttnn/cpp/ttnn/tensor/tensor.hpp
@@ -368,5 +368,6 @@ bool validate_worker_modes(const std::vector<Device*> &workers);
 namespace ttnn {
 using Tensor = tt::tt_metal::Tensor;
+using TensorSpec = std::pair<ttnn::SimpleShape, tt::tt_metal::TensorLayout>;
 } // namespace ttnn
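
For context, a minimal Python usage sketch of the two experimental ops registered above, ttnn.experimental.nlp_create_qkv_heads_segformer and ttnn.experimental.nlp_create_qkv_heads_vit. This is an assumption-laden sketch, not part of the change: it assumes a Wormhole device is available, that the sequence length is a multiple of the tile height (32), and the shapes follow the docstrings added in the pybind files ([B, 1, S, 2304] in, three [B, 12, S, 64] out).

import torch
import ttnn

# Hypothetical setup; device id and shapes are illustrative only.
device = ttnn.open_device(device_id=0)

# Fused QKV activations: [B, 1, S, 2304], where 2304 = 3 * 12 heads * 64 head_dim.
fused_qkv = torch.randn(1, 1, 224, 2304, dtype=torch.bfloat16)
fused_qkv_tt = ttnn.from_torch(fused_qkv, layout=ttnn.TILE_LAYOUT, device=device)

# Each output is [B, 12, S, 64]; the segformer variant is called the same way.
q, k, v = ttnn.experimental.nlp_create_qkv_heads_vit(fused_qkv_tt)

ttnn.close_device(device)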
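The reader kernel above walks each tile-row of the fused input as 24 Q tiles, then 24 K tiles, then 24 V tiles, and the writer lays each group of two tiles into consecutive head rows, which implies the last input dimension is ordered [Q | K | V] with 12 contiguous heads of width 64 each. A host-side reference of that shuffle, written under that assumption (a sketch of the intended semantics, not the device implementation):

import torch

def reference_create_qkv_heads(fused_qkv: torch.Tensor):
    """fused_qkv: [B, 1, S, 2304] -> three tensors of shape [B, 12, S, 64] (assumed layout)."""
    B, _, S, _ = fused_qkv.shape
    # Split the width into the Q, K, V regions of 12 * 64 = 768 columns each.
    q, k, v = fused_qkv.squeeze(1).split(12 * 64, dim=-1)   # each [B, S, 768]

    def split_heads(t: torch.Tensor) -> torch.Tensor:
        # Heads are assumed contiguous along the width: head h owns columns [64*h, 64*(h+1)).
        return t.view(B, S, 12, 64).permute(0, 2, 1, 3)     # [B, 12, S, 64]

    return split_heads(q), split_heads(k), split_heads(v)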
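The Reduce::compute_output_specs change keeps a logical size of 1 on each reduced dimension while the padded size stays at a full tile (32), and the TensorSpec pair carries both through TensorLayout::fromLegacyPaddedShape. A small Python sketch of that shape arithmetic, mirroring the switch in the diff (illustrative only):

TILE_HEIGHT = TILE_WIDTH = 32

def reduce_output_shapes(logical_shape, padded_shape, dim):
    """dim is one of 'H', 'W', 'HW'; returns (logical, padded) output shapes."""
    logical, padded = list(logical_shape), list(padded_shape)
    if dim in ("H", "HW"):
        logical[2], padded[2] = 1, TILE_HEIGHT
    if dim in ("W", "HW"):
        logical[3], padded[3] = 1, TILE_WIDTH
    return logical, padded

# e.g. reducing [1, 1, 224, 2304] along W -> logical [1, 1, 224, 1], padded [1, 1, 224, 32]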