diff --git a/.github/workflows/build-and-unit-tests.yaml b/.github/workflows/build-and-unit-tests.yaml
index 6a795e82620..b5ad5ef3e3a 100644
--- a/.github/workflows/build-and-unit-tests.yaml
+++ b/.github/workflows/build-and-unit-tests.yaml
@@ -30,7 +30,7 @@ jobs:
       - uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@main
         with:
           token: ${{ secrets.CHECKOUT_TOKEN }}
-      - name: Set up dyanmic env vars for build
+      - name: Set up dynamic env vars for build
         run: |
           echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
       - name: Build tt-metal and libs
diff --git a/tests/ttnn/sweep_tests/README.md b/tests/ttnn/sweep_tests/README.md
index cb956903b56..9696a9fd0b5 100644
--- a/tests/ttnn/sweep_tests/README.md
+++ b/tests/ttnn/sweep_tests/README.md
@@ -7,15 +7,51 @@ python tests/ttnn/sweep_tests/run_all_tests.py
 
 ## Printing report of all sweeps
 ```
-python tests/ttnn/sweep_tests/print_report.py
+python tests/ttnn/sweep_tests/print_report.py [--detailed]
 ```
 
 ## Debugging sweeps
 ```
-python tests/ttnn/sweep_tests/run_failed_and_crashed_tests.py [--exclude add,linear]
+python tests/ttnn/sweep_tests/run_failed_and_crashed_tests.py [--exclude add,linear] [--stepwise]
 ```
 
 ## Running a single test
 ```
 python tests/ttnn/sweep_tests/run_single_test.py --test-name add --index 0
 ```
+
+## Adding a new sweep test
+In `tests/ttnn/sweep_tests/sweeps`, add a new file `<test_name>.py`.
+
+The file must contain:
+- a `parameters` dictionary mapping each parameter name to the list of values to sweep
+- a `skip` function for filtering out unwanted parameter combinations. It should return `bool`
+- a `run` function for running the test. It should return `Tuple[bool, Optional[str]]`, where the second element of the tuple is the error message
+
+For example, let's add `tests/ttnn/sweep_tests/sweeps/to_and_from_device.py`:
+```python
+import torch
+import ttnn
+
+from tests.ttnn.utils_for_testing import check_with_pcc
+
+parameters = {
+    "height": [1, 32],
+    "width": [1, 32],
+}
+
+
+def skip(height, width):
+    # Skip the degenerate 1x1 case
+    if height == 1 and width == 1:
+        return True
+    return False
+
+
+def run(height, width, *, device):
+    torch_tensor = torch.zeros((height, width))
+
+    # Round-trip the tensor through the device and compare against the original
+    tensor = ttnn.from_torch(torch_tensor, device=device)
+    tensor = ttnn.to_torch(tensor)
+
+    return check_with_pcc(torch_tensor, tensor)
+```
diff --git a/tests/ttnn/sweep_tests/print_report.py b/tests/ttnn/sweep_tests/print_report.py
index 3b648e5d93f..1050e0657e1 100644
--- a/tests/ttnn/sweep_tests/print_report.py
+++ b/tests/ttnn/sweep_tests/print_report.py
@@ -3,11 +3,17 @@
 # SPDX-License-Identifier: Apache-2.0
 
+import argparse
+
 from tests.ttnn.sweep_tests.sweep import print_report
 
 
 def main():
-    print_report()
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--detailed", action="store_true")
+    detailed = parser.parse_args().detailed
+
+    print_report(detailed=detailed)
 
 
 if __name__ == "__main__":
diff --git a/tests/ttnn/sweep_tests/run_all_tests.py b/tests/ttnn/sweep_tests/run_all_tests.py
index ce035e40d95..ff630a2edb3 100644
--- a/tests/ttnn/sweep_tests/run_all_tests.py
+++ b/tests/ttnn/sweep_tests/run_all_tests.py
@@ -2,12 +2,15 @@
 
 # SPDX-License-Identifier: Apache-2.0
 
+import ttnn
 from tests.ttnn.sweep_tests.sweep import run_all_tests, print_report
 
 
 def main():
-    run_all_tests()
+    device = ttnn.open(0)
+    run_all_tests(device=device)
+    ttnn.close(device)
     print_report()
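Note on the sweep-file format documented above: the `--index` passed to `run_single_test.py` selects one entry from the expansion of the `parameters` dictionary. A minimal sketch of that expansion, assuming a Cartesian product over the value lists (the exact ordering of sweep.py's `permutations` helper is not shown in this diff, so `expand_parameters` below is a hypothetical stand-in):
```python
import itertools

parameters = {"height": [1, 32], "width": [1, 32]}


def expand_parameters(parameters):
    # One dict per combination, e.g. {"height": 1, "width": 32}.
    keys = list(parameters)
    for values in itertools.product(*parameters.values()):
        yield dict(zip(keys, values))


for index, permutation in enumerate(expand_parameters(parameters)):
    print(index, permutation)
# 0 {'height': 1, 'width': 1}   <- the example's skip() filters this one out
# 1 {'height': 1, 'width': 32}
# 2 {'height': 32, 'width': 1}
# 3 {'height': 32, 'width': 32}
```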
diff --git a/tests/ttnn/sweep_tests/run_failed_and_crashed_tests.py b/tests/ttnn/sweep_tests/run_failed_and_crashed_tests.py
index a3509f9eb1d..9fcdb3ee02e 100644
--- a/tests/ttnn/sweep_tests/run_failed_and_crashed_tests.py
+++ b/tests/ttnn/sweep_tests/run_failed_and_crashed_tests.py
@@ -10,16 +10,27 @@
 from tests.ttnn.sweep_tests.sweep import run_failed_and_crashed_tests
 
 
+def parse_exclude_string(exclude):
+    if exclude is None:
+        exclude = []
+    else:
+        exclude = exclude.split(",")
+        exclude = [test_name.strip() for test_name in exclude]
+    return set(exclude)
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--exclude", type=str)
+    parser.add_argument("--stepwise", action="store_true")
     exclude = parser.parse_args().exclude
-    exclude = exclude.split(",")
-    exclude = [test_name.strip() for test_name in exclude]
+    stepwise = parser.parse_args().stepwise
+
+    exclude = parse_exclude_string(exclude)
 
     device = ttnn.open(0)
-    run_failed_and_crashed_tests(device=device, exclude=exclude)
+    run_failed_and_crashed_tests(device=device, stepwise=stepwise, exclude=exclude)
     ttnn.close(device)
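Two behavioral changes in the runner above are worth calling out: stopping at the first failure is now opt-in via `--stepwise` (previously any failed or crashed test ended the run), and factoring the `--exclude` handling into `parse_exclude_string` means a missing `--exclude` no longer calls `.split` on `None`. Expected behavior of the helper, copied here so the checks run standalone:
```python
# Copied from the diff above; illustrates the None-safe parsing.
def parse_exclude_string(exclude):
    if exclude is None:
        exclude = []
    else:
        exclude = exclude.split(",")
        exclude = [test_name.strip() for test_name in exclude]
    return set(exclude)


assert parse_exclude_string(None) == set()                          # no --exclude given
assert parse_exclude_string("add,linear") == {"add", "linear"}
assert parse_exclude_string(" add , linear ") == {"add", "linear"}  # whitespace is stripped
```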
diff --git a/tests/ttnn/sweep_tests/sweep.py b/tests/ttnn/sweep_tests/sweep.py
index 632fdc3a24a..dd014f38321 100644
--- a/tests/ttnn/sweep_tests/sweep.py
+++ b/tests/ttnn/sweep_tests/sweep.py
@@ -102,7 +102,7 @@ def sweep(sweep_file_name, run, skip, parameters, *, device):
 def _run_single_test(run, skip, parameters, index, *, device):
     permutation = list(permutations(parameters))[index]
     pretty_printed_parameters = ",\n".join(f"\t{key}={value}" for key, value in permutation.items())
-    logger.info(f"Reproducing sweep results at index {index}:\n{{{pretty_printed_parameters}}}")
+    logger.info(f"Running sweep test at index {index}:\n{{{pretty_printed_parameters}}}")
     if skip(**permutation):
         return "skipped", None
     passed, message = run(**permutation, device=device)
@@ -130,42 +130,19 @@ def run_single_test(test_name, index, *, device):
     return status, message
 
 
-def run_all_tests():
+def run_all_tests(*, device):
     logger.info(f"Deleting old sweep results in {SWEEP_RESULTS_DIR}")
     if SWEEP_RESULTS_DIR.exists():
         for file_name in SWEEP_RESULTS_DIR.glob("*.csv"):
             file_name.unlink()
 
-    device = ttnn.open(0)
     for file_name in sorted(SWEEP_SOURCES_DIR.glob("*.py")):
         logger.info(f"Running {file_name}")
         sweep_module = SourceFileLoader("sweep_module", str(file_name)).load_module()
         sweep(file_name, sweep_module.run, sweep_module.skip, sweep_module.parameters, device=device)
-    ttnn.close(device)
 
 
-def print_report():
-    stats_df = pd.DataFrame(columns=["name", "passed", "failed", "skipped", "crashed"])
-
-    def add_row(df, name):
-        df.loc[-1] = [name, 0, 0, 0, 0]
-        df.index = df.index + 1
-        df.reset_index(inplace=True, drop=True)
-        return df
-
-    for file_name in sorted(SWEEP_RESULTS_DIR.glob("*.csv")):
-        df = pd.read_csv(file_name)
-        stats_df = add_row(stats_df, file_name.stem)
-        for status in stats_df.columns[1:]:
-            stats_df.at[len(stats_df) - 1, status] = (df["status"] == status).sum()
-
-    stats_df = add_row(stats_df, "total")
-    stats_df.loc[len(stats_df) - 1, stats_df.columns[1:]] = stats_df[stats_df.columns[1:]].sum()
-
-    print(stats_df)
-
-
-def run_failed_and_crashed_tests(*, device, exclude):
+def run_failed_and_crashed_tests(*, device, stepwise, exclude):
     keep_running = True
     for file_name in sorted(SWEEP_RESULTS_DIR.glob("*.csv")):
         test_name = file_name.stem
@@ -185,13 +162,56 @@
         if row.status not in {"failed", "crashed"}:
             continue
 
-        status, _ = run_single_test(file_name.stem, index, device=device)
+        status, message = run_single_test(file_name.stem, index, device=device)
         logger.info(status)
-        if status in {"failed", "crashed"}:
+        if status in {"failed", "crashed"} and stepwise:
             keep_running = False
             break
 
         df.at[index, "status"] = status
-        df.at[index, "message"] = None
+        df.at[index, "message"] = message
 
     df.to_csv(file_name)
+
+
+def print_summary():
+    stats_df = pd.DataFrame(columns=["name", "passed", "failed", "skipped", "crashed"])
+
+    def add_row(df, name):
+        df.loc[-1] = [name, 0, 0, 0, 0]
+        df.index = df.index + 1
+        df.reset_index(inplace=True, drop=True)
+        return df
+
+    for file_name in sorted(SWEEP_RESULTS_DIR.glob("*.csv")):
+        df = pd.read_csv(file_name)
+        stats_df = add_row(stats_df, file_name.stem)
+        for status in stats_df.columns[1:]:
+            stats_df.at[len(stats_df) - 1, status] = (df["status"] == status).sum()
+
+    stats_df = add_row(stats_df, "total")
+    stats_df.loc[len(stats_df) - 1, stats_df.columns[1:]] = stats_df[stats_df.columns[1:]].sum()
+
+    print(stats_df)
+
+
+def print_detailed_report():
+    for file_name in sorted(SWEEP_RESULTS_DIR.glob("*.csv")):
+        name = file_name.stem
+        df = pd.read_csv(file_name)
+        for index, row in enumerate(df.itertuples()):
+            if row.status in {"failed", "crashed"}:
+                print(f"{name}@{index}: {row.status}")
+                print(f"\t{row.exception}")
+            elif row.status == "skipped":
+                print(f"{name}@{index}: {row.status}")
+            else:
+                print(f"{name}@{index}: {row.status}")
+        print()
+
+
+def print_report(*, detailed=False):
+    if detailed:
+        print_detailed_report()
+    else:
+        print_summary()
diff --git a/tests/ttnn/sweep_tests/sweeps/linear.py b/tests/ttnn/sweep_tests/sweeps/linear.py
index 46bc460883e..52ceebd88fb 100644
--- a/tests/ttnn/sweep_tests/sweeps/linear.py
+++ b/tests/ttnn/sweep_tests/sweeps/linear.py
@@ -55,7 +55,9 @@ def run(
         torch_bias = torch_random((n_size,), -0.1, 0.1, dtype=torch.float32)
     else:
         torch_bias = None
-    torch_output_tensor = torch.nn.functional.linear(torch_input_tensor_a, torch_input_tensor_b, bias=torch_bias)
+    torch_output_tensor = torch.nn.functional.linear(
+        torch_input_tensor_a, torch_input_tensor_b.T.contiguous(), bias=torch_bias
+    )
 
     input_tensor_a = ttnn.from_torch(
         torch_input_tensor_a,
diff --git a/tests/ttnn/sweep_tests/sweeps/unary.py b/tests/ttnn/sweep_tests/sweeps/unary.py
index c507d2aeffc..4e0d8bec8e0 100644
--- a/tests/ttnn/sweep_tests/sweeps/unary.py
+++ b/tests/ttnn/sweep_tests/sweeps/unary.py
@@ -45,7 +45,12 @@ def run(
 ):
     input_shape = (*batch_sizes, height, width)
 
-    torch_input_tensor = torch_random(input_shape, -0.1, 0.1, dtype=torch.float32)
+    low = -0.1
+    high = 0.1
+    if ttnn_function in {ttnn.rsqrt}:
+        low = 0.0
+
+    torch_input_tensor = torch_random(input_shape, low, high, dtype=torch.float32)
     torch_output_tensor = torch_function(torch_input_tensor)
 
     input_tensor = ttnn.from_torch(
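On the two sweep fixes above: `torch.nn.functional.linear` computes `input @ weight.T + bias`, so passing `torch_input_tensor_b.T.contiguous()` makes the torch reference compute a plain `a @ b`, which appears to be the convention the ttnn op under test follows. The unary.py change restricts the random input range for `ttnn.rsqrt` because rsqrt is undefined for negative inputs; sampling from [-0.1, 0.1) would fill the torch reference with NaNs and sink the PCC comparison. A quick illustration:
```python
import torch

# rsqrt(x) = 1 / sqrt(x): NaN for negative x, finite for positive x.
print(torch.rsqrt(torch.tensor([-0.05, 0.04])))  # tensor([nan, 5.])
```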
diff --git a/ttnn/core.py b/ttnn/core.py
index 430784d2aea..0198888df3c 100644
--- a/ttnn/core.py
+++ b/ttnn/core.py
@@ -1276,16 +1276,20 @@ def softmax(
 
     input_tensor = ttnn.unsqueeze_to_4D(input_tensor)
 
-    ttl_input_tensor = input_tensor.value
     is_padded_and_using_tile = (
         input_tensor.layout == ttnn.TILE_LAYOUT
         and list(input_tensor.shape)[-2:] != list(input_tensor.shape.padded())[-2:]
     )
     if not is_padded_and_using_tile and dim == rank - 1:
+        ttl_input_tensor = input_tensor.value
         # TODO: #4599 Research why softmax appears to not be stable when using a padded ttnn.TILE_LAYOUT
         ttl_output_tensor = ttl.tensor.softmax(ttl_input_tensor, output_mem_config=memory_config)
     else:
         dim_4D = dim + 4 - rank
+
+        input_tensor = ttnn.to_layout(input_tensor, ttnn.TILE_LAYOUT)
+        ttl_input_tensor = input_tensor.value
+
         ttl_output_tensor = ttl.operations.primary.moreh_softmax(
             ttl_input_tensor, dim=dim_4D, output_mem_config=memory_config
         )
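On the softmax change: `softmax` unsqueezes its input to 4D before dispatching, so the user-supplied `dim` (normalized against the original rank) has to be shifted by the number of prepended singleton dimensions before calling `moreh_softmax`. A sketch of that mapping, with `to_dim_4d` a hypothetical name for the inline `dim_4D = dim + 4 - rank` expression:
```python
def to_dim_4d(dim: int, rank: int) -> int:
    # unsqueeze_to_4D prepends (4 - rank) singleton dimensions,
    # shifting every original axis right by that amount.
    return dim + 4 - rank


print(to_dim_4d(dim=2, rank=3))  # 3: last axis of the padded (1, d0, d1, d2) shape
print(to_dim_4d(dim=0, rank=2))  # 2: first original axis of (1, 1, d0, d1)
```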