Merge pull request #1960 from StanfordAHA/hetero-cgra

MU2F I/O tiles
StanfordAHA · Dec 3, 2024 · 40a9ca2
2 parents 129bebe + 66e2f60
commit 40a9ca2
Show file tree

Hide file tree

Showing 5 changed files with 64 additions and 29 deletions.
diff --git a/aha/util/regress.py b/aha/util/regress.py
@@ -24,6 +24,8 @@ def add_subparser(subparser):
     parser.add_argument("--pipeline-num", default=32, type=int)
     parser.add_argument("--sparse-tile-pairs-list", default="", type=str, nargs="*")
     parser.add_argument("--unroll", default=1, type=int)
+    parser.add_argument("--using-matrix-unit", action="store_true")
+    parser.add_argument("--mu-datawidth", default=16, type=int)
     parser.set_defaults(dispatch=dispatch)
 
 
@@ -66,15 +68,14 @@ def buildkite_call(command, env={}, return_output=False, out_file=None):
             else:
                 raise
 
-def gen_garnet(width, height, dense_only=False):
+def gen_garnet(width, height, dense_only=False, using_matrix_unit=False, mu_datawidth=16):
     print("--- Generating Garnet", flush=True)
     start = time.time()
     if not os.path.exists("/aha/garnet/garnet.v"):
         # Daemon is no good if/when we build new/different verilog
         buildkite_call("aha garnet --daemon kill".split())
 
         # No garnet verilog yet, so build it now.
-
         buildkite_args = [
                             "aha",
                             "garnet",
@@ -88,12 +89,17 @@ def gen_garnet(width, height, dense_only=False):
         if dense_only:
             buildkite_args.append("--dense-only")
 
+        if using_matrix_unit:
+            buildkite_args.append("--using-matrix-unit")
+            buildkite_args.append("--mu-datawidth")
+            buildkite_args.append(str(mu_datawidth))
+
         buildkite_call(buildkite_args)
 
     return time.time() - start
 
 
-def generate_sparse_bitstreams(sparse_tests, width, height, seed_flow, data_tile_pairs, kernel_name, opal_workaround=False, unroll=1):
+def generate_sparse_bitstreams(sparse_tests, width, height, seed_flow, data_tile_pairs, kernel_name, opal_workaround=False, unroll=1, using_matrix_unit=False):
     if len(sparse_tests) == 0:
         return 0
 
@@ -125,6 +131,8 @@ def generate_sparse_bitstreams(sparse_tests, width, height, seed_flow, data_tile
         ]
         if opal_workaround:
             build_tb_cmd.append("--opal-workaround")
+        if using_matrix_unit:
+            build_tb_cmd.append("--using-matrix-unit")
         buildkite_call(
             build_tb_cmd,
             env=env_vars,
@@ -156,6 +164,8 @@ def generate_sparse_bitstreams(sparse_tests, width, height, seed_flow, data_tile
         ]
         if opal_workaround:
             build_tb_cmd.append("--opal-workaround")
+        if using_matrix_unit:
+            build_tb_cmd.append("--using-matrix-unit")
         buildkite_call(
             build_tb_cmd,
             env=env_vars,
@@ -202,13 +212,18 @@ def format_concat_tiles(test, data_tile_pairs, kernel_name, pipeline_num=32, unr
     return all_tiles, num_list
 
 
-def test_sparse_app(testname, seed_flow, data_tile_pairs, pipeline_num_l=None, opal_workaround=False, test="", test_dataset_runtime_dict=None):
+def test_sparse_app(testname, seed_flow, data_tile_pairs, pipeline_num_l=None, opal_workaround=False, test="", test_dataset_runtime_dict=None, using_matrix_unit=False, cgra_height=32, mu_datawidth=16):
     if test == "":
         test = testname
 
     print(f"--- {test}")
 
     env_vars = {"PYTHONPATH": "/aha/garnet/"}
+    if using_matrix_unit:
+        env_vars["WEST_IN_IO_SIDES"] = "1"
+        env_vars["USING_MATRIX_UNIT"] = "1"
+        env_vars["OC_0"] = str(2*cgra_height)
+        env_vars["MU_DATAWIDTH"] = str(mu_datawidth)
 
     app_path = f"{testname}_0/GLB_DIR/{testname}_combined_seed_0"
     print(app_path, flush=True)
@@ -279,7 +294,7 @@ def test_sparse_app(testname, seed_flow, data_tile_pairs, pipeline_num_l=None, o
     return 0, 0, time_test
 
 
-def test_dense_app(test, width, height, env_parameters, extra_args, layer=None, dense_only=False, use_fp=False):
+def test_dense_app(test, width, height, env_parameters, extra_args, layer=None, dense_only=False, use_fp=False, using_matrix_unit=False, cgra_height=32, mu_datawidth=16):
     env_parameters = str(env_parameters)
     testname = layer if layer is not None else test
     print(f"--- {testname}")
@@ -323,22 +338,31 @@ def test_dense_app(test, width, height, env_parameters, extra_args, layer=None,
     if dense_only:
         buildkite_args.append("--dense-only")
 
+    env_vars = {}
+
+    if using_matrix_unit:
+        buildkite_args.append("--using-matrix-unit")
+        env_vars["WEST_IN_IO_SIDES"] = "1"
+        env_vars["USING_MATRIX_UNIT"] = "1"
+        env_vars["OC_0"] = str(2*cgra_height)
+        env_vars["MU_DATAWIDTH"] = str(mu_datawidth)
+
     buildkite_call(buildkite_args)
 
     time_map = time.time() - start
 
     print(f"--- {testname} - glb testing", flush=True)
     start = time.time()
     if use_fp:
-        buildkite_call(["aha", "test", test, "--dense-fp"])
+        buildkite_call(["aha", "test", test, "--dense-fp"], env=env_vars)
     else:
-        buildkite_call(["aha", "test", test])
+        buildkite_call(["aha", "test", test], env=env_vars)
     time_test = time.time() - start
 
     return time_compile, time_map, time_test
 
 
-def test_hardcoded_dense_app(test, width, height, env_parameters, extra_args, layer=None, dense_only=False):
+def test_hardcoded_dense_app(test, width, height, env_parameters, extra_args, layer=None, dense_only=False, using_matrix_unit=False, cgra_height=32, mu_datawidth=16):
     env_parameters = str(env_parameters)
     testname = layer if layer is not None else test
     print(f"--- {testname}")
@@ -401,14 +425,23 @@ def test_hardcoded_dense_app(test, width, height, env_parameters, extra_args, la
 
     if dense_only:
         buildkite_args.append("--dense-only")
+
+    env_vars = {}
+
+    if using_matrix_unit:
+        buildkite_args.append("--using-matrix-unit")
+        env_vars["WEST_IN_IO_SIDES"] = "1"
+        env_vars["USING_MATRIX_UNIT"] = "1"
+        env_vars["OC_0"] = str(2*cgra_height)
+        env_vars["MU_DATAWIDTH"] = str(mu_datawidth)
 
     buildkite_call(buildkite_args)
 
     time_map = time.time() - start
 
     print(f"--- {testname} - glb testing", flush=True)
     start = time.time()
-    buildkite_call(["aha", "test", test])
+    buildkite_call(["aha", "test", test], env=env_vars)
     time_test = time.time() - start
 
     return time_compile, time_map, time_test
@@ -418,6 +451,8 @@ def dispatch(args, extra_args=None):
     seed_flow = not args.non_seed_flow
     use_pipeline = args.use_pipeline
     pipeline_num = args.pipeline_num
+    using_matrix_unit = args.using_matrix_unit
+    mu_datawidth = args.mu_datawidth
     unroll = args.unroll
 
     # Preserve backward compatibility
@@ -463,7 +498,7 @@ def dispatch(args, extra_args=None):
 
     print(f"--- Running regression: {args.config}", flush=True)
     info = []
-    t = gen_garnet(width, height, dense_only=False)
+    t = gen_garnet(width, height, dense_only=False, using_matrix_unit=using_matrix_unit, mu_datawidth=mu_datawidth)
     info.append(["garnet with sparse and dense", t])
 
     data_tile_pairs = []
@@ -490,16 +525,16 @@ def dispatch(args, extra_args=None):
             print("HERE ARE THE DATA TILE PAIRS!")
             print(data_tile_pairs)
 
-            generate_sparse_bitstreams(sparse_tests, width, height, seed_flow, data_tile_pairs, kernel_name, opal_workaround=args.opal_workaround, unroll=unroll)
+            generate_sparse_bitstreams(sparse_tests, width, height, seed_flow, data_tile_pairs, kernel_name, opal_workaround=args.opal_workaround, unroll=unroll, using_matrix_unit=using_matrix_unit)
 
             for test in sparse_tests:
                 if use_pipeline:
                     assert (not seed_flow), "Pipeline mode is not supported with seed flow"
                     tile_pairs, pipeline_num_l = format_concat_tiles(test, data_tile_pairs, kernel_name, pipeline_num, unroll)
-                    t0, t1, t2 = test_sparse_app(test, seed_flow, tile_pairs, pipeline_num_l, opal_workaround=args.opal_workaround, test_dataset_runtime_dict=test_dataset_runtime_dict)
+                    t0, t1, t2 = test_sparse_app(test, seed_flow, tile_pairs, pipeline_num_l, opal_workaround=args.opal_workaround, test_dataset_runtime_dict=test_dataset_runtime_dict, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth)
                     info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2])
                 else:
-                    t0, t1, t2 = test_sparse_app(test, seed_flow, data_tile_pairs, opal_workaround=args.opal_workaround, test_dataset_runtime_dict=test_dataset_runtime_dict)
+                    t0, t1, t2 = test_sparse_app(test, seed_flow, data_tile_pairs, opal_workaround=args.opal_workaround, test_dataset_runtime_dict=test_dataset_runtime_dict, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth)
                     info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2])
 
                 # remove the generated collateral for tiles that passed to avoid overrunning the disk
@@ -511,46 +546,46 @@ def dispatch(args, extra_args=None):
                 for dataset, time_value in dataset_runtime_dict.items():
                     perf_out_file.write(f"{testname}        {dataset}        {time_value}\n")   
     else:
-        generate_sparse_bitstreams(sparse_tests, width, height, seed_flow, data_tile_pairs, kernel_name, opal_workaround=args.opal_workaround, unroll=unroll)
+        generate_sparse_bitstreams(sparse_tests, width, height, seed_flow, data_tile_pairs, kernel_name, opal_workaround=args.opal_workaround, unroll=unroll, using_matrix_unit=using_matrix_unit)
 
         for test in sparse_tests:
             assert(not use_pipeline), "Pipeline mode is not supported with seed flow"
-            t0, t1, t2 = test_sparse_app(test, seed_flow, data_tile_pairs, opal_workaround=args.opal_workaround)
+            t0, t1, t2 = test_sparse_app(test, seed_flow, data_tile_pairs, opal_workaround=args.opal_workaround, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth)
             info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2])
 
     for test in glb_tests:
         t0, t1, t2 = test_dense_app(test, 
-                                    width, height, args.env_parameters, extra_args)
+                                    width, height, args.env_parameters, extra_args, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth)
         info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2])
 
     for test in glb_tests_fp:
         t0, t1, t2 = test_dense_app(test, 
-                                    width, height, args.env_parameters, extra_args, use_fp=True)
+                                    width, height, args.env_parameters, extra_args, use_fp=True, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth)
         info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2])
 
     for test in resnet_tests:
         if "residual" in test:
             t0, t1, t2 = test_dense_app("apps/resnet_residual",
-                                        width, height, args.env_parameters, extra_args, layer=test)
+                                        width, height, args.env_parameters, extra_args, layer=test, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth)
             info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2])
         else:
             t0, t1, t2 = test_dense_app("apps/resnet_output_stationary",
-                                        width, height, args.env_parameters, extra_args, layer=test)
+                                        width, height, args.env_parameters, extra_args, layer=test, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth)
             info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2])
 
     for test in resnet_tests_fp:
         if "residual" in test:
             t0, t1, t2 = test_dense_app("apps/conv2D_residual_fp",
-                                        width, height, args.env_parameters, extra_args, layer=test, use_fp=True)
+                                        width, height, args.env_parameters, extra_args, layer=test, use_fp=True, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth)
             info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2])
         else:
             t0, t1, t2 = test_dense_app("apps/conv2D_fp",
-                                        width, height, args.env_parameters, extra_args, layer=test, use_fp=True)
+                                        width, height, args.env_parameters, extra_args, layer=test, use_fp=True, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth)
             info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2])
 
     for test in hardcoded_dense_tests:
         t0, t1, t2 = test_hardcoded_dense_app(test,
-                                    width, height, args.env_parameters, extra_args)
+                                    width, height, args.env_parameters, extra_args, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth)
         info.append([test + "_glb", t0 + t1 + t2, t0, t1, t2])
 
     if args.include_dense_only_tests:
@@ -560,22 +595,22 @@ def dispatch(args, extra_args=None):
         if os.WEXITSTATUS(exit_status) != 0:
             raise RuntimeError(f"Command 'rm /aha/garnet/garnet.v' returned non-zero exit status {os.WEXITSTATUS(exit_status)}.")
 
-        t = gen_garnet(width, height, dense_only=True)
+        t = gen_garnet(width, height, dense_only=True, using_matrix_unit=using_matrix_unit, mu_datawidth=mu_datawidth)
         info.append(["garnet with dense only", t])
 
         num_dense_only_glb_tests = 5
         for test_index, test in enumerate(glb_tests):
             if test_index == num_dense_only_glb_tests:
                 break
             t0, t1, t2 = test_dense_app(test, 
-                                        width, height, args.env_parameters, extra_args, dense_only=True)
+                                        width, height, args.env_parameters, extra_args, dense_only=True, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth)
             info.append([test + "_glb dense only", t0 + t1 + t2, t0, t1, t2])
 
         for test in resnet_tests:
             # residual resnet test is not working with dense only mode
             if "residual" not in test:
                 t0, t1, t2 = test_dense_app("apps/resnet_output_stationary",
-                                            width, height, args.env_parameters, extra_args, layer=test)
+                                            width, height, args.env_parameters, extra_args, layer=test, using_matrix_unit=using_matrix_unit, cgra_height=height, mu_datawidth=mu_datawidth)
                 info.append([test + "_glb dense only", t0 + t1 + t2, t0, t1, t2])
 
     print(f"+++ TIMING INFO", flush=True)

diff --git a/archipelago b/archipelago
diff --git a/canal b/canal
diff --git a/garnet b/garnet
diff --git a/lake b/lake
+6 −5		archipelago/pipeline.py
+6 −1		archipelago/pnr_.py
+18 −16		archipelago/sta.py
+1 −0		canal/cyclone.py
+4 −3		canal/global_signal.py
+2 −0		canal/interconnect.py
+19 −11		canal/util.py
+22 −11		cgra/util_onyx.py
+38 −6		cgra/wiring_onyx.py
+81 −18		garnet.py
+2 −2		global_buffer/design/global_buffer.py
+3 −0		global_buffer/design/global_buffer_parameter.py
+14 −7		global_buffer/io_placement.py
+1 −0		global_controller/global_controller_genesis2.py
+4 −2		global_controller/global_controller_magma.py
+2 −1		global_controller/rtl/genesis/glc_axi_addrmap.svp
+4 −3		global_controller/rtl/genesis/glc_jtag_ctrl.svp
+3 −2		global_controller/rtl/genesis/global_controller.svp
+1 −1		mapper/netlist_util.py
+33 −20		memory_core/memtile_util.py
+136 −0		memory_core/mu2f_io_core_rv.py
+13 −4		passes/interconnect_port_pass/interconnect_port_pass.py
+10 −1		tests/test_app/Makefile
+141 −0		tests/test_app/genesis_tb/top.svp
+30 −3		tests/test_app/lib/map.c
+11 −2		tests/test_app/lib/parser.c
+12 −8		tests/test_app/tb/environment.sv
+60 −5		tests/test_app/tb/top.sv
+19 −7		tests/test_memory_core/build_tb.py
+348 −0		lake/modules/mu2f_io_core.py
+36 −0		lake/modules/mux.py
+199 −0		tests/Makefile
+7 −0		tests/dump_fsdb.tcl
+161 −0		tests/mu2f_io_core_tb.sv