From 3c721b212c144f7c7dec6aa8538ee901d1077b0f Mon Sep 17 00:00:00 2001 From: root Date: Fri, 6 Dec 2024 18:20:19 -0800 Subject: [PATCH 1/4] HW to add SBs to North I/O tiles; working lopsided CGRA (with hacks) --- aha/util/regress.py | 9 +++++++-- canal | 2 +- garnet | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/aha/util/regress.py b/aha/util/regress.py index 4bee9e02..d386e157 100644 --- a/aha/util/regress.py +++ b/aha/util/regress.py @@ -93,6 +93,7 @@ def gen_garnet(width, height, dense_only=False, using_matrix_unit=False, mu_data buildkite_args.append("--using-matrix-unit") buildkite_args.append("--mu-datawidth") buildkite_args.append(str(mu_datawidth)) + buildkite_args.append("--give-north-io-sbs") buildkite_call(buildkite_args) @@ -133,6 +134,7 @@ def generate_sparse_bitstreams(sparse_tests, width, height, seed_flow, data_tile build_tb_cmd.append("--opal-workaround") if using_matrix_unit: build_tb_cmd.append("--using-matrix-unit") + build_tb_cmd.append("--give-north-io-sbs") buildkite_call( build_tb_cmd, env=env_vars, @@ -166,6 +168,7 @@ def generate_sparse_bitstreams(sparse_tests, width, height, seed_flow, data_tile build_tb_cmd.append("--opal-workaround") if using_matrix_unit: build_tb_cmd.append("--using-matrix-unit") + build_tb_cmd.append("--give-north-io-sbs") buildkite_call( build_tb_cmd, env=env_vars, @@ -342,7 +345,8 @@ def test_dense_app(test, width, height, env_parameters, extra_args, layer=None, if using_matrix_unit: buildkite_args.append("--using-matrix-unit") - env_vars["WEST_IN_IO_SIDES"] = "1" + buildkite_args.append("--give-north-io-sbs") + #env_vars["WEST_IN_IO_SIDES"] = "1" env_vars["USING_MATRIX_UNIT"] = "1" env_vars["OC_0"] = str(2*cgra_height) env_vars["MU_DATAWIDTH"] = str(mu_datawidth) @@ -431,6 +435,7 @@ def test_hardcoded_dense_app(test, width, height, env_parameters, extra_args, la if using_matrix_unit: buildkite_args.append("--using-matrix-unit") + buildkite_args.append("--give-north-io-sbs") env_vars["WEST_IN_IO_SIDES"] = "1" env_vars["USING_MATRIX_UNIT"] = "1" env_vars["OC_0"] = str(2*cgra_height) @@ -619,4 +624,4 @@ def dispatch(args, extra_args=None): def gather_tests(tags): - pass + pass \ No newline at end of file diff --git a/canal b/canal index 420d831f..e4519748 160000 --- a/canal +++ b/canal @@ -1 +1 @@ -Subproject commit 420d831f0e22156af0268f99c891f43f812a50a6 +Subproject commit e4519748a276db4ba8ded8c37cf25dedf51695c3 diff --git a/garnet b/garnet index fef07782..2b82ad99 160000 --- a/garnet +++ b/garnet @@ -1 +1 @@ -Subproject commit fef0778274597afa61275a02508c0a57839d613a +Subproject commit 2b82ad99d282d3eabf3ccb462cea3211b465c3a1 From 66b5fcc18c72652914907ca15ec814e040d1e7cf Mon Sep 17 00:00:00 2001 From: root Date: Mon, 9 Dec 2024 10:16:41 -0800 Subject: [PATCH 2/4] Remove fabric cols: checkpoint 1 --- aha/util/regress.py | 83 ++++++++++++++++++++++++++++++++------------- canal | 2 +- garnet | 2 +- 3 files changed, 61 insertions(+), 26 deletions(-) diff --git a/aha/util/regress.py b/aha/util/regress.py index d386e157..be946544 100644 --- a/aha/util/regress.py +++ b/aha/util/regress.py @@ -26,6 +26,7 @@ def add_subparser(subparser): parser.add_argument("--unroll", default=1, type=int) parser.add_argument("--using-matrix-unit", action="store_true") parser.add_argument("--mu-datawidth", default=16, type=int) + parser.add_argument("--num-fabric-cols-removed", default=0, type=int) parser.set_defaults(dispatch=dispatch) @@ -68,7 +69,7 @@ def buildkite_call(command, env={}, return_output=False, out_file=None): else: raise -def gen_garnet(width, height, dense_only=False, using_matrix_unit=False, mu_datawidth=16): +def gen_garnet(width, height, dense_only=False, using_matrix_unit=False, mu_datawidth=16, num_fabric_cols_removed=0): print("--- Generating Garnet", flush=True) start = time.time() if not os.path.exists("/aha/garnet/garnet.v"): @@ -94,13 +95,15 @@ def gen_garnet(width, height, dense_only=False, using_matrix_unit=False, mu_data buildkite_args.append("--mu-datawidth") buildkite_args.append(str(mu_datawidth)) buildkite_args.append("--give-north-io-sbs") + buildkite_args.append("--num-fabric-cols-removed") + buildkite_args.append(str(num_fabric_cols_removed)) buildkite_call(buildkite_args) return time.time() - start -def generate_sparse_bitstreams(sparse_tests, width, height, seed_flow, data_tile_pairs, kernel_name, opal_workaround=False, unroll=1, using_matrix_unit=False): +def generate_sparse_bitstreams(sparse_tests, width, height, seed_flow, data_tile_pairs, kernel_name, opal_workaround=False, unroll=1, using_matrix_unit=False, num_fabric_cols_removed=0): if len(sparse_tests) == 0: return 0 @@ -135,6 +138,8 @@ def generate_sparse_bitstreams(sparse_tests, width, height, seed_flow, data_tile if using_matrix_unit: build_tb_cmd.append("--using-matrix-unit") build_tb_cmd.append("--give-north-io-sbs") + build_tb_cmd.append("--num-fabric-cols-removed") + build_tb_cmd.append(str(num_fabric_cols_removed)) buildkite_call( build_tb_cmd, env=env_vars, @@ -169,6 +174,8 @@ def generate_sparse_bitstreams(sparse_tests, width, height, seed_flow, data_tile if using_matrix_unit: build_tb_cmd.append("--using-matrix-unit") build_tb_cmd.append("--give-north-io-sbs") + build_tb_cmd.append("--num-fabric-cols-removed") + build_tb_cmd.append(str(num_fabric_cols_removed)) buildkite_call( build_tb_cmd, env=env_vars, @@ -215,7 +222,7 @@ def format_concat_tiles(test, data_tile_pairs, kernel_name, pipeline_num=32, unr return all_tiles, num_list -def test_sparse_app(testname, seed_flow, data_tile_pairs, pipeline_num_l=None, opal_workaround=False, test="", test_dataset_runtime_dict=None, using_matrix_unit=False, cgra_height=32, mu_datawidth=16): +def test_sparse_app(testname, seed_flow, data_tile_pairs, pipeline_num_l=None, opal_workaround=False, test="", test_dataset_runtime_dict=None, using_matrix_unit=False, cgra_height=32, mu_datawidth=16, num_fabric_cols_removed=0): if test == "": test = testname @@ -223,7 +230,8 @@ def test_sparse_app(testname, seed_flow, data_tile_pairs, pipeline_num_l=None, o env_vars = {"PYTHONPATH": "/aha/garnet/"} if using_matrix_unit: - env_vars["WEST_IN_IO_SIDES"] = "1" + if num_fabric_cols_removed == 0: + env_vars["WEST_IN_IO_SIDES"] = "1" env_vars["USING_MATRIX_UNIT"] = "1" env_vars["OC_0"] = str(2*cgra_height) env_vars["MU_DATAWIDTH"] = str(mu_datawidth) @@ -297,7 +305,7 @@ def test_sparse_app(testname, seed_flow, data_tile_pairs, pipeline_num_l=None, o return 0, 0, time_test -def test_dense_app(test, width, height, env_parameters, extra_args, layer=None, dense_only=False, use_fp=False, using_matrix_unit=False, cgra_height=32, mu_datawidth=16): +def test_dense_app(test, width, height, env_parameters, extra_args, layer=None, dense_only=False, use_fp=False, using_matrix_unit=False, cgra_height=32, mu_datawidth=16, num_fabric_cols_removed=0): env_parameters = str(env_parameters) testname = layer if layer is not None else test print(f"--- {testname}") @@ -346,7 +354,12 @@ def test_dense_app(test, width, height, env_parameters, extra_args, layer=None, if using_matrix_unit: buildkite_args.append("--using-matrix-unit") buildkite_args.append("--give-north-io-sbs") - #env_vars["WEST_IN_IO_SIDES"] = "1" + buildkite_args.append("--num-fabric-cols-removed") + buildkite_args.append(str(num_fabric_cols_removed)) + + if num_fabric_cols_removed == 0: + env_vars["WEST_IN_IO_SIDES"] = "1" + env_vars["USING_MATRIX_UNIT"] = "1" env_vars["OC_0"] = str(2*cgra_height) env_vars["MU_DATAWIDTH"] = str(mu_datawidth) @@ -366,7 +379,7 @@ def test_dense_app(test, width, height, env_parameters, extra_args, layer=None, return time_compile, time_map, time_test -def test_hardcoded_dense_app(test, width, height, env_parameters, extra_args, layer=None, dense_only=False, using_matrix_unit=False, cgra_height=32, mu_datawidth=16): +def test_hardcoded_dense_app(test, width, height, env_parameters, extra_args, layer=None, dense_only=False, using_matrix_unit=False, cgra_height=32, mu_datawidth=16, num_fabric_cols_removed=0): env_parameters = str(env_parameters) testname = layer if layer is not None else test print(f"--- {testname}") @@ -436,7 +449,10 @@ def test_hardcoded_dense_app(test, width, height, env_parameters, extra_args, la if using_matrix_unit: buildkite_args.append("--using-matrix-unit") buildkite_args.append("--give-north-io-sbs") - env_vars["WEST_IN_IO_SIDES"] = "1" + + if num_fabric_cols_removed == 0: + env_vars["WEST_IN_IO_SIDES"] = "1" + env_vars["USING_MATRIX_UNIT"] = "1" env_vars["OC_0"] = str(2*cgra_height) env_vars["MU_DATAWIDTH"] = str(mu_datawidth) @@ -459,8 +475,13 @@ def dispatch(args, extra_args=None): pipeline_num = args.pipeline_num using_matrix_unit = args.using_matrix_unit mu_datawidth = args.mu_datawidth + num_fabric_cols_removed = args.num_fabric_cols_removed unroll = args.unroll + # Can only remove col that is multiple of 4 + assert num_fabric_cols_removed % 4 == 0, "ERROR: Number of cols removed must be a multiple of 4" + assert num_fabric_cols_removed <= 8, "ERROR: Removing more than 8 columns is not supported yet. Hardware modifications may be necessary to proceed." + # Preserve backward compatibility if args.config == "daily": args.config = "pr_aha" # noqa if args.config == "pr": args.config = "pr_submod" # noqa @@ -504,7 +525,7 @@ def dispatch(args, extra_args=None): print(f"--- Running regression: {args.config}", flush=True) info = [] - t = gen_garnet(width, height, dense_only=False, using_matrix_unit=using_matrix_unit, mu_datawidth=mu_datawidth) + t = gen_garnet(width, height, dense_only=False, using_matrix_unit=using_matrix_unit, mu_datawidth=mu_datawidth, num_fabric_cols_removed=num_fabric_cols_removed) info.append(["garnet with sparse and dense", t]) data_tile_pairs = [] @@ -531,16 +552,19 @@ def dispatch(args, extra_args=None): print("HERE ARE THE DATA TILE PAIRS!") print(data_tile_pairs) - generate_sparse_bitstreams(sparse_tests, width, height, seed_flow, data_tile_pairs, kernel_name, opal_workaround=args.opal_workaround, unroll=unroll, using_matrix_unit=using_matrix_unit) + generate_sparse_bitstreams(sparse_tests, width, height, seed_flow, data_tile_pairs, kernel_name, + opal_workaround=args.opal_workaround, unroll=unroll, using_matrix_unit=using_matrix_unit, num_fabric_cols_removed=num_fabric_cols_removed) for test in sparse_tests: if use_pipeline: assert (not seed_flow), "Pipeline mode is not supported with seed flow" tile_pairs, pipeline_num_l = format_concat_tiles(test, data_tile_pairs, kernel_name, pipeline_num, unroll) - t0, t1, t2 = test_sparse_app(test, seed_flow, tile_pairs, pipeline_num_l, opal_workaround=args.opal_workaround, test_dataset_runtime_dict=test_dataset_runtime_dict, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth) + t0, t1, t2 = test_sparse_app(test, seed_flow, tile_pairs, pipeline_num_l, opal_workaround=args.opal_workaround, test_dataset_runtime_dict=test_dataset_runtime_dict, + using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth, num_fabric_cols_removed=num_fabric_cols_removed) info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2]) else: - t0, t1, t2 = test_sparse_app(test, seed_flow, data_tile_pairs, opal_workaround=args.opal_workaround, test_dataset_runtime_dict=test_dataset_runtime_dict, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth) + t0, t1, t2 = test_sparse_app(test, seed_flow, data_tile_pairs, opal_workaround=args.opal_workaround, test_dataset_runtime_dict=test_dataset_runtime_dict, + using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth, num_fabric_cols_removed=num_fabric_cols_removed) info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2]) # remove the generated collateral for tiles that passed to avoid overrunning the disk @@ -552,46 +576,55 @@ def dispatch(args, extra_args=None): for dataset, time_value in dataset_runtime_dict.items(): perf_out_file.write(f"{testname} {dataset} {time_value}\n") else: - generate_sparse_bitstreams(sparse_tests, width, height, seed_flow, data_tile_pairs, kernel_name, opal_workaround=args.opal_workaround, unroll=unroll, using_matrix_unit=using_matrix_unit) + generate_sparse_bitstreams(sparse_tests, width, height, seed_flow, data_tile_pairs, kernel_name, + opal_workaround=args.opal_workaround, unroll=unroll, using_matrix_unit=using_matrix_unit, num_fabric_cols_removed=num_fabric_cols_removed) for test in sparse_tests: assert(not use_pipeline), "Pipeline mode is not supported with seed flow" - t0, t1, t2 = test_sparse_app(test, seed_flow, data_tile_pairs, opal_workaround=args.opal_workaround, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth) + t0, t1, t2 = test_sparse_app(test, seed_flow, data_tile_pairs, opal_workaround=args.opal_workaround, + using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth, num_fabric_cols_removed=num_fabric_cols_removed) info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2]) for test in glb_tests: t0, t1, t2 = test_dense_app(test, - width, height, args.env_parameters, extra_args, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth) + width, height, args.env_parameters, extra_args, + using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth, num_fabric_cols_removed=num_fabric_cols_removed) info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2]) for test in glb_tests_fp: t0, t1, t2 = test_dense_app(test, - width, height, args.env_parameters, extra_args, use_fp=True, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth) + width, height, args.env_parameters, extra_args, use_fp=True, + using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth, num_fabric_cols_removed=num_fabric_cols_removed) info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2]) for test in resnet_tests: if "residual" in test: t0, t1, t2 = test_dense_app("apps/resnet_residual", - width, height, args.env_parameters, extra_args, layer=test, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth) + width, height, args.env_parameters, extra_args, layer=test, + using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth, num_fabric_cols_removed=num_fabric_cols_removed) info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2]) else: t0, t1, t2 = test_dense_app("apps/resnet_output_stationary", - width, height, args.env_parameters, extra_args, layer=test, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth) + width, height, args.env_parameters, extra_args, layer=test, + using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth, num_fabric_cols_removed=num_fabric_cols_removed) info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2]) for test in resnet_tests_fp: if "residual" in test: t0, t1, t2 = test_dense_app("apps/conv2D_residual_fp", - width, height, args.env_parameters, extra_args, layer=test, use_fp=True, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth) + width, height, args.env_parameters, extra_args, layer=test, use_fp=True, + using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth, num_fabric_cols_removed=num_fabric_cols_removed) info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2]) else: t0, t1, t2 = test_dense_app("apps/conv2D_fp", - width, height, args.env_parameters, extra_args, layer=test, use_fp=True, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth) + width, height, args.env_parameters, extra_args, layer=test, use_fp=True, + using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth, num_fabric_cols_removed=num_fabric_cols_removed) info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2]) for test in hardcoded_dense_tests: t0, t1, t2 = test_hardcoded_dense_app(test, - width, height, args.env_parameters, extra_args, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth) + width, height, args.env_parameters, extra_args, + using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth, num_fabric_cols_removed=num_fabric_cols_removed) info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2]) if args.include_dense_only_tests: @@ -601,7 +634,7 @@ def dispatch(args, extra_args=None): if os.WEXITSTATUS(exit_status) != 0: raise RuntimeError(f"Command 'rm /aha/garnet/garnet.v' returned non-zero exit status {os.WEXITSTATUS(exit_status)}.") - t = gen_garnet(width, height, dense_only=True, using_matrix_unit=using_matrix_unit, mu_datawidth=mu_datawidth) + t = gen_garnet(width, height, dense_only=True, using_matrix_unit=using_matrix_unit, mu_datawidth=mu_datawidth, num_fabric_cols_removed=num_fabric_cols_removed) info.append(["garnet with dense only", t]) num_dense_only_glb_tests = 5 @@ -609,14 +642,16 @@ def dispatch(args, extra_args=None): if test_index == num_dense_only_glb_tests: break t0, t1, t2 = test_dense_app(test, - width, height, args.env_parameters, extra_args, dense_only=True, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth) + width, height, args.env_parameters, extra_args, dense_only=True, + using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth, num_fabric_cols_removed=num_fabric_cols_removed) info.append([test + "_glb dense only", t0 + t1 + t2, t0, t1, t2]) for test in resnet_tests: # residual resnet test is not working with dense only mode if "residual" not in test: t0, t1, t2 = test_dense_app("apps/resnet_output_stationary", - width, height, args.env_parameters, extra_args, layer=test, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth) + width, height, args.env_parameters, extra_args, layer=test, + using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth, num_fabric_cols_removed=num_fabric_cols_removed) info.append([test + "_glb dense only", t0 + t1 + t2, t0, t1, t2]) print(f"+++ TIMING INFO", flush=True) diff --git a/canal b/canal index e4519748..01d283b7 160000 --- a/canal +++ b/canal @@ -1 +1 @@ -Subproject commit e4519748a276db4ba8ded8c37cf25dedf51695c3 +Subproject commit 01d283b7d5af68d5a5675268fbc7e32552be03cc diff --git a/garnet b/garnet index 2b82ad99..56f164f1 160000 --- a/garnet +++ b/garnet @@ -1 +1 @@ -Subproject commit 2b82ad99d282d3eabf3ccb462cea3211b465c3a1 +Subproject commit 56f164f19565a80c165fba8bc551f723f250c1d7 From 056280a425faf8f2b734b92631198ffc372a1049 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 9 Dec 2024 13:03:04 -0800 Subject: [PATCH 3/4] Bump garnet --- garnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/garnet b/garnet index 56f164f1..61f2bd96 160000 --- a/garnet +++ b/garnet @@ -1 +1 @@ -Subproject commit 56f164f19565a80c165fba8bc551f723f250c1d7 +Subproject commit 61f2bd96d94099a38e2bde62fe2c4e1a3b043b33 From a07df0a93b1e54f7d0268f5e96de3efd59f192e9 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 10 Dec 2024 10:40:31 -0800 Subject: [PATCH 4/4] Bump canal --- canal | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/canal b/canal index 01d283b7..f201df6b 160000 --- a/canal +++ b/canal @@ -1 +1 @@ -Subproject commit 01d283b7d5af68d5a5675268fbc7e32552be03cc +Subproject commit f201df6b0aaaf931e0d2ecc8d76c781f4c708c91