# Static class for entries of each SA module
class SystolicArrayRegistry(object):
    """Process-wide counter of the systolic-array (SA) modules generated so far."""

    # Index handed to the next SA module; used in generated file names and
    # as the `inst<N>_` prefix added by add_prefix().
    sa_module_cnt = 0


def count_SA_size(code):
    """Report the 2-D systolic-array size found in AutoSA generated code.

    The first two integers written as `` N,`` within the 100 characters that
    follow the last ``PE_wrapper`` occurrence are read as the largest PE
    indices, so each dimension is ``N + 1``.

    Returns:
        tuple: ``(dimX, dimY)``.

    Raises:
        SystemExit: with status 1 when fewer than two indices are found.
    """
    pos = code.rfind("PE_wrapper")
    # rfind() returns -1 when the marker is absent; slicing from -1 would
    # silently inspect the tail of the code instead of failing.
    function = code[pos:pos + 100] if pos >= 0 else ""
    dims = re.findall(r" (\d+),", function)
    if len(dims) < 2:
        print("Failed to generate 2d SA. Size", dims)
        # A bare sys.exit() reports success (status 0) to the caller.
        sys.exit(1)

    dimX, dimY = int(dims[0]) + 1, int(dims[1]) + 1
    print(f"[ INFO ] generating SA dimnesion {dimX}x{dimY}.")
    return dimX, dimY


def indent(num):
    """Return `num` repetitions of the indentation unit."""
    return " " * num


def get_function_code(name, code):
    """Return the text of the helper function `name` found in `code`.

    The slice starts ``len("inline void")`` characters before the first
    occurrence of `name` and stops at the next ``/* Helper`` marker (or at
    the end of `code` when there is no further marker). An empty string is
    returned when `name` does not occur at all.
    """
    pos = code.find(name)
    if pos < 0:
        return ""
    # Clamp so a match near the start cannot wrap around to a negative index.
    start_pos = max(0, pos - len("inline void"))
    end_pos = code.find("/* Helper", pos)
    if end_pos < 0:
        end_pos = len(code)
    return code[start_pos:end_pos]


def get_ser_size(code):
    """Multiply ``bound + 1`` over every ``<= bound;`` condition in `code`.

    Each line may hold at most one such condition. Returns 1 when there is
    none.
    """
    bound_re = re.compile(r"<= (\d+);")
    size = 1
    for line in code.split("\n"):
        rets = bound_re.findall(line)
        if not rets:
            continue
        assert len(rets) == 1
        size *= int(rets[0]) + 1
    return size


def insert_data_pack(ret_code, header, off_chip_data, written_data):
    """Adapt the SA call site and header to the packed types in the signature.

    For every call argument that appears in a parameter of the
    ``autosa_func(...);`` declaration in `header`:

    * off-chip arguments get ``#undef``/``#define`` lines prepended to
      `header`, redefining ``<arg>_t`` as ``ap_uint<32 * factor>``;
    * on-chip arguments listed in `written_data` are redirected to a new
      ``<arg>_sa`` buffer, followed by a ``host_deserialize_<arg>`` call.

    Returns:
        tuple: ``(ret_code, header)`` -- note the order.

    Raises:
        IndexError: when the call, the declaration, or a packed type suffix
            (``_t<N> ``) cannot be found.
    """
    ret_code = ret_code.replace("buffer_", "").replace("[0]", "")
    # Extract the designated data types
    calls = re.findall(r"autosa_func\((.*?)\)", ret_code)
    signature = re.findall(r"autosa_func\((.*?)\);", header)
    if not calls or not signature:
        raise IndexError("autosa_func call or declaration not found")
    args = calls[0].split(", ")
    types = signature[0].split(", ")
    pack_re = re.compile(r"_t(\d+) ")

    # If the arg is accessed from off-chip memory, then we replace the typedef
    # with the target packed data type
    for t in types:
        for arg in args:
            if arg not in t:
                continue
            target_type = pack_re.findall(t)[0]
            target_type_bits = int(target_type) * 32
            # Off-chip coalesced data access
            if arg in off_chip_data:
                header = f"#undef {arg}_t\n#define {arg}_t ap_uint<{target_type_bits}>\n" + header
            # Insert data packing and (de)serialization: create a new buffer
            # and reshape it to the original buffer around the AutoSA call
            elif arg in written_data:
                print(f"[ INFO ] Writing to on-chip memory {arg}. Packed into ap_uint<{target_type_bits}>...")
                # Allocate a new buffer and perform data deserialization
                deser_func = f"host_deserialize_{arg}"
                # Size the buffer from the loop bounds of the helper
                code = get_function_code(deser_func, header)
                size = get_ser_size(code)
                ret_code = ret_code.replace(arg, f"{arg}_sa")
                ret_code = f"float {arg}_sa[{size}];\n" + indent(5) + ret_code + \
                    indent(6) + f"{deser_func}({arg}, {arg}_sa);\n"

    return ret_code, header


# Update HLS function names in the generated Extern IP core
def add_prefix(header, ret_code):
    """Prefix the AutoSA entry points with ``inst<N>_`` for this SA instance.

    Only whole identifiers are renamed, so a name that merely ends in
    ``PE`` is left untouched. The registry counter is incremented afterwards.

    Returns:
        tuple: ``(header, ret_code)``.
    """
    # Preserved function keywords in AutoSA generated code
    function_list = [
        "autosa_func", "PE_wrapper", "PE"
    ]
    index = SystolicArrayRegistry.sa_module_cnt
    for f in function_list:
        call_re = re.compile(r"\b" + re.escape(f) + r"\(")
        renamed = f" inst{index}_{f}("
        header = call_re.sub(renamed, header)
        ret_code = call_re.sub(renamed, ret_code)
    SystolicArrayRegistry.sa_module_cnt += 1
    return header, ret_code


def infer_default_params(loop_bounds):
    """Pick default AutoSA parameters from the loop bounds.

    Three bounds select the MatMul defaults, six bounds the Conv defaults.

    Returns:
        tuple: ``(ST, PART, LAT, SIMD, extra_flags)``.

    Raises:
        ValueError: for any other number of bounds, or non-integer MatMul
            bounds.
    """
    assert len(loop_bounds) > 1, loop_bounds
    if len(loop_bounds) not in (3, 6):
        raise ValueError(
            "expected 3 (MatMul) or 6 (Conv) loop bounds, got {}".format(loop_bounds))
    extra_flags = "--simd-info=./autosa_tests/mm_hcl/simd_info.json "

    # Params for MatMul
    if len(loop_bounds) == 3:
        m, n, k = [int(_) for _ in loop_bounds]
        if m > 1 and n > 1 and k > 1:
            ST = 3
            SA_dim_x = 4
            SA_dim_y = 4
            PART = f"{m},{n},{k}"
            if m > 256 or n > 256 or k > 256:
                LAT = [16, 16]
            else:
                LAT = [m // SA_dim_x, n // SA_dim_y]
            # A latency factor of 0 is replaced by 1
            LAT = ",".join("1" if _ == 0 else str(_) for _ in LAT)
            SIMD = k if k <= 8 else 4
        # Map reduction loop to space dim
        else:
            ST = 2
            PART = "10,16"
            LAT = "2,2"
            SIMD = 4
            extra_flags += "--local-reduce --reduce-op=\"+\" --simd-touch-space "

    # Params for Conv
    else:
        OC, OH, OW, IC, R, C = loop_bounds
        ST = 4
        print(f"[ INFO ] input size OC({OC}), OH({OH}), OW({OW}), IC({IC}), R({R}), C({C})")
        PART = "16,13,13,1"
        LAT = "2,1,2"
        SIMD = "1,1,2,4"
        extra_flags = "--simd-info=./autosa_tests/cnn/simd_info.json "
    return ST, PART, LAT, SIMD, extra_flags


def generate_systolic_array(keys, values, code, backend):
    """Run AutoSA on `code` and return the generated module.

    Args:
        keys: attribute names; each item exposes the name as ``.value``.
        values: attribute payloads, parallel to `keys`, also read via
            ``.value``.
        code: C statements written into the body of a generated ``main``.
        backend: ``"vhls"`` or ``"aocl"``.

    Returns:
        list: ``[header, ret_code]``. When the ``AUTOSA`` environment
        variable is unset or empty, `header` is empty and `ret_code` is a
        commented placeholder call.

    Raises:
        RuntimeError: for an unknown `backend` (only checked when AutoSA is
            configured).
    """
    packed_data = list()
    is_axis_enabled = False
    loop_bounds = list()
    off_chip_data = list()
    written_data = list()

    # Process attribute information for AutoSA module
    for index, key_node in enumerate(keys):
        key = key_node.value
        if key == "axis":
            is_axis_enabled = True
        elif key == "loop_bound":
            loop_bounds = values[index].value.split(",")
        elif key == "tensor_placement":
            for var in values[index].value.split(","):
                var_name = var.replace("[0]", "").replace("[1]", "")
                var_name = var_name.replace("[read]", "").replace("[write]", "")
                if "[0]" in var:
                    off_chip_data.append(var_name)
                if "[write]" in var:
                    written_data.append(var_name)
        else:
            # Any other attribute is read as "<is_transpose>,<pack_factor>"
            # for the tensor named by the key. It has to be recorded under
            # `key`: `var` is only a leftover of the placement loop above.
            try:
                is_transpose, pack_factor = values[index].value.split(",")
                int(is_transpose)
                if int(pack_factor) > 0:
                    packed_data.append(key)
            except ValueError:
                # Not a "<int>,<int>" payload: ignore the attribute.
                pass

    instance = SystolicArrayRegistry.sa_module_cnt
    autosa_c_source = f"hcl_autosa_tmp_inst{instance}.c"
    pwd = os.getcwd()
    with open(autosa_c_source, "w") as fp:
        # NOTE(review): the include directive carries no header name here;
        # confirm against the repository whether one was intended.
        fp.write("#include \n")
        fp.write("int main(int argc, char **argv) {\n")
        fp.write(code)
        fp.write("}")

    ret_code = "autosa_func(args);\n"

    # check autosa installation
    autosa_dir = os.environ.get("AUTOSA", "")
    if "AUTOSA" not in os.environ:
        print("[{}] WARNING: AutoSA not found. Please setup env variable AUTOSA".format(time.strftime("%H:%M:%S", time.gmtime())))
    if autosa_dir == "":
        ret_code = "// Not found AutoSA. returns function placeholder\n" + indent(6) + ret_code
        header = ""
        return [header, ret_code]

    source_path = os.path.join(pwd, autosa_c_source)
    cmd = "cd {}; ".format(autosa_dir)
    cmd += "./autosa "
    cmd += "{} ".format(source_path)
    cmd += "--config=./autosa_config/autosa_config.json "
    if backend == "vhls":
        cmd += "--target=autosa_hls_c "
    elif backend == "aocl":
        cmd += "--target=autosa_opencl "
    else:
        raise RuntimeError(f"Illegal backend {backend}")
    cmd += "--output-dir=./autosa.tmp/output "

    # Get the default value
    ST, PART, LAT, SIMD, extra_flags = infer_default_params(loop_bounds)
    # Internal debugging interface to set up the params
    sa_space_time = os.getenv("SA_SPACE_TIME", ST)
    sa_array_part = os.getenv("SA_ARRAY_PAR", PART)
    sa_lat_hiding = os.getenv("SA_LAT_HIDING", LAT)
    sa_simd = os.getenv("SA_SIMD", SIMD)

    print(f"[ INFO ] AutoSA params: Array partition {sa_array_part}. Latency hiding {sa_lat_hiding}. SIMD{sa_simd}")
    cmd += "--sa-sizes=\"{{kernel[]->space_time[{}];".format(sa_space_time)
    cmd += "kernel[]->array_part[{}];".format(sa_array_part)
    cmd += "kernel[]->latency[{}];".format(sa_lat_hiding)
    cmd += "kernel[]->simd[{}]".format(sa_simd)
    cmd += "}\" "

    cmd += "--hls "
    cmd += "--hcl "
    if is_axis_enabled:
        pass  # cmd += "--axi-stream "

    # configure data packing
    if backend == "vhls":
        data_pack_config = ""
        if len(packed_data) > 0:
            data_pack_config = "--data-pack-sizes=\"{"
            data_pack_config += ";".join(
                "kernel[]->{}[8,32,64]".format(var) for var in packed_data)
            data_pack_config += "}\" "

        if data_pack_config == "":
            data_pack_config = "--no-data-pack "
        cmd += data_pack_config
        cmd += extra_flags

    # additional flags for intel ocl
    if backend == "aocl":
        cmd += "--loop-infinitize --double-buffer-style=0 "

    # Add serialization if the SA module has interface arguments
    # cmd += "--host-serialize "
    print(f"[ INFO ] AutoSA command {cmd}")

    # Save autosa command for debugging purposes
    with open(f"hcl_autosa_cmd_inst{instance}.sh", "w") as fp:
        fp.write(cmd)
    run_process(cmd)

    # Extract the autosa generated code
    if backend == "vhls":
        autosa_header = f"hcl_autosa_tmp_inst{instance}_hcl_decl.h"
    else:
        autosa_header = "hcl_autosa_tmp_kernel.h"

    ext = "cpp" if backend == "vhls" else "cl"
    source_file = f"{autosa_dir}/autosa.tmp/output/src/hcl_autosa_tmp_inst{instance}_kernel.{ext}"
    with open(source_file, "r") as fp:
        header = fp.read() + "\n"
    header = header.replace(f"#include \"{autosa_header}\"", "")

    if backend == "aocl":
        # Also extract the helper functions for data serialization and deserialization
        with open(f"{autosa_dir}/autosa.tmp/output/src/hcl_autosa_tmp_host.h", "r") as f:
            content = f.read()
        annotation = "/* Helper Function */"
        start_pos = content.find(annotation)
        end_pos = content.rfind(annotation) + len(annotation)
        header += content[start_pos:end_pos] + "\n"

    # For xilinx HLS backend
    else:
        count_SA_size(header)

    # External module call inside top function
    with open(f"{autosa_dir}/autosa.tmp/output/src/{autosa_header}", "r") as fp:
        ret_code = fp.readline().strip() + ";\n"

    # Add prefix to SA functions
    header, ret_code = add_prefix(header, ret_code)

    # Bitcasting the input arguments (to AutoSA selected bit-packing factor)
    # 1. Substitute data type (interface arg) is decided by AutoSA (and possibly do some extra padding).
    # 2. Substitute data serialization size and intrinsic
    ret_code, header = insert_data_pack(ret_code, header, off_chip_data, written_data)

    return [header, ret_code]
a/python/heterocl/tvm/build_module.py b/python/heterocl/tvm/build_module.py index 30d417ee8..e5ede8bfc 100755 --- a/python/heterocl/tvm/build_module.py +++ b/python/heterocl/tvm/build_module.py @@ -380,6 +380,9 @@ def lower(sch, stmt = ir_pass.AdjustBufferBinding(stmt, arg_list) stmt = ir_pass.InferStream(stmt, arg_list) stmt = ir_pass.AdjustBufferBinding(stmt, arg_list) + # perform layout transformation + stmt = ir_pass.TransformLayout(stmt, arg_list) + stmt = ir_pass.AdjustBufferBinding(stmt, arg_list) for f in lower_phase3: stmt = f(stmt) if simple_mode: diff --git a/python/heterocl/tvm/runtime.py b/python/heterocl/tvm/runtime.py index e6363f25d..9624afe1a 100644 --- a/python/heterocl/tvm/runtime.py +++ b/python/heterocl/tvm/runtime.py @@ -2,8 +2,8 @@ import os, subprocess, time, re, glob from ..report import parse_xml from ..devices import Project - -debug = True +from ..autosa import generate_systolic_array +from ..util import run_process def find_path(path, fname): file_dir = [] @@ -30,16 +30,6 @@ def replace_text(f_name, prev, new): with open(f_name, 'w') as fp: fp.write(data) -def run_process(cmd, pattern=None, env=None): - if debug: print("[DEBUG] Running commands: \n{}\n".format(cmd)) - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) - out, err = p.communicate() - if err: raise RuntimeError("Error raised: ", err.decode()) - if pattern: return re.findall(pattern, out.decode("utf-8")) - if debug: - print("[DEBUG] Commands outputs: \n{}\n".format(out.decode("utf-8"))) - return out.decode("utf-8") - @register_func def exec_init(dev_hash, tool, mode): # check whether pre-compiled bitstream exitsts @@ -74,6 +64,19 @@ def exec_init(dev_hash, tool, mode): @register_func def process_extern_module(attr_key, annotate_keys, annotate_values, code): + if attr_key == "soda": + pos = code.find("#include") + code = code[pos:] + code = code.replace("extern \"C\" {", "") + code = code.replace("} // extern \"C\"", "") + func_call = "" + return [code, func_call] 
def transpose(self, src, tensor, target_shape):
    """Forward a layout transformation request to the `_TransformLayout` FFI."""
    return _api_internal._TransformLayout(self, src, tensor, target_shape)


def systolic(self, **kwargs):
    """Mark this stage for systolic-array generation.

    Every keyword argument is exported as an environment variable of the
    current process before the `_StageSystolic` FFI call is made. Values
    are converted with ``str()`` because ``os.environ`` only accepts
    strings; passing an integer would otherwise raise ``TypeError``.
    """
    for key, value in kwargs.items():
        os.environ[key] = str(value)
    _api_internal._StageSystolic(self)
def run_process(cmd, pattern=None, env=None, debug=True, check=False):
    """Run `cmd` through the shell and return what it wrote to stdout.

    Args:
        cmd: command line, executed with ``shell=True``. It is interpreted
            by the shell, so it must not be built from untrusted text.
        pattern: optional regular expression; when given, the list returned
            by ``re.findall(pattern, stdout)`` is returned instead of the
            raw text.
        env: optional mapping of extra environment variables. They are
            layered on top of the current environment, so ``PATH`` and
            friends stay available to the command.
        debug: print the command and, when no `pattern` is given, its output.
        check: raise when the command exits with a non-zero status.

    Returns:
        str or list: decoded stdout, or the `pattern` matches.

    Raises:
        RuntimeError: if `check` is true and the exit status is non-zero.
    """
    import os  # local import keeps this helper independent of the file header

    if debug:
        print("[DEBUG] Running commands: \n{}\n".format(cmd))
    child_env = None if env is None else dict(os.environ, **env)
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True,
                            env=child_env)
    # stderr is not piped, so it goes straight to the console and the second
    # item returned by communicate() is always None.
    out, _ = proc.communicate()
    if check and proc.returncode != 0:
        raise RuntimeError(
            "Error raised: command exited with status {}".format(proc.returncode))
    text = out.decode("utf-8")
    if pattern:
        return re.findall(pattern, text)
    if debug:
        print("[DEBUG] Commands outputs: \n{}\n".format(text))
    return text
np_C = np.zeros((m,n)) + args = (np_A, np_B, np_C) + + print(hcl.lower(s)) + f = hcl.build(s, target=p) + f(hcl.asarray(np_A), hcl.asarray(np_B), hcl.asarray(np_C)) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--size', nargs='?', const=1024, type=int, default=1024) + args = parser.parse_args() + autosa_systolic_array(args.size) diff --git a/tests/issues/test_issue_416.py b/tests/issues/test_issue_416.py index 7556f7bae..960d10a9e 100644 --- a/tests/issues/test_issue_416.py +++ b/tests/issues/test_issue_416.py @@ -61,7 +61,7 @@ def update_knn(dist, knn_mat, i, j): # Inputs/Outputs definition (§4) # Scalars (§4.1) - test_image = hcl.placeholder((), "test_image") + test_image = hcl.placeholder((1,), "test_image") # Tensors (§4.2) train_images = hcl.placeholder(data_size, "train_images") @@ -106,8 +106,9 @@ def test_code_gen_knn(): target = hcl.Platform.aws_f1 target.config(compiler="vitis", backend="vhls", mode="debug") code = top(target) - assert "buffer_test_image = test_image" in code, code - + for line in code: + if "buffer_test_image" in line: + assert "sizeof(ap_uint<49>)*1" in line, line if __name__ == "__main__": test_code_gen_knn() diff --git a/tests/test_cont_integration.py b/tests/test_cont_integration.py deleted file mode 100644 index 89616c3e0..000000000 --- a/tests/test_cont_integration.py +++ /dev/null @@ -1,67 +0,0 @@ -import heterocl as hcl -import os -import numpy as np - -def test_vivado_hls(): - def test_hls(target_mode): - hcl.init(hcl.Int(16)) - A = hcl.placeholder((10,), "A") - def kernel(A): - B = hcl.compute(A.shape, lambda *args : A[args] + 1, "B") - return B - - target = hcl.Platform.aws_f1 - s = hcl.create_schedule([A], kernel) - s.to(A, target.xcel) - s.to(kernel.B, target.host) - target.config(compiler="vivado_hls", mode=target_mode) - f = hcl.build(s, target) - - np_A = np.random.randint(10, size=(10,)) - np_B = np.zeros((10,)) - - hcl_A = hcl.asarray(np_A, dtype=hcl.Int(16)) - hcl_B = 
hcl.asarray(np_B, dtype=hcl.Int(16)) - f(hcl_A, hcl_B) - ret_B = hcl_B.asnumpy() - - report = f.report() - np.testing.assert_array_equal(ret_B, (np_A+1)*1) - - if os.getenv("LOCAL_CI_TEST"): - test_hls("csim|csyn") - else: - assert os.getenv("LOCAL_CI_TEST") == None - -def test_vitis(): - def test_hls(target_mode): - hcl.init(hcl.Int(16)) - A = hcl.placeholder((10,), "A") - def kernel(A): - B = hcl.compute(A.shape, lambda *args : A[args] + 1, "B") - return B - - target = hcl.Platform.aws_f1 - s = hcl.create_schedule([A], kernel) - s.to(A, target.xcel) - s.to(kernel.B, target.host) - target.config(compiler="vitis", mode=target_mode) - f = hcl.build(s, target) - - np_A = np.random.randint(10, size=(10,)) - np_B = np.zeros((10,)) - - hcl_A = hcl.asarray(np_A, dtype=hcl.Int(16)) - hcl_B = hcl.asarray(np_B, dtype=hcl.Int(16)) - f(hcl_A, hcl_B) - ret_B = hcl_B.asnumpy() - np.testing.assert_array_equal(ret_B, (np_A+1)*1) - - if os.getenv("LOCAL_CI_TEST"): - test_hls("sw_sim") - else: - assert os.getenv("LOCAL_CI_TEST") == None - -if __name__ == "__main__": - test_vivado_hls() - test_vitis() diff --git a/tests/test_continuous_integration.py b/tests/test_continuous_integration.py new file mode 100644 index 000000000..024ba0990 --- /dev/null +++ b/tests/test_continuous_integration.py @@ -0,0 +1,127 @@ +import heterocl as hcl +import os +import numpy as np + +def test_vivado_hls(): + def test_hls(target_mode): + hcl.init(hcl.Int(16)) + A = hcl.placeholder((10,), "A") + def kernel(A): + B = hcl.compute(A.shape, lambda *args : A[args] + 1, "B") + return B + + target = hcl.Platform.aws_f1 + s = hcl.create_schedule([A], kernel) + s.to(A, target.xcel) + s.to(kernel.B, target.host) + target.config(compiler="vivado_hls", mode=target_mode) + f = hcl.build(s, target) + + np_A = np.random.randint(10, size=(10,)) + np_B = np.zeros((10,)) + + hcl_A = hcl.asarray(np_A, dtype=hcl.Int(16)) + hcl_B = hcl.asarray(np_B, dtype=hcl.Int(16)) + f(hcl_A, hcl_B) + ret_B = hcl_B.asnumpy() + + report 
= f.report() + np.testing.assert_array_equal(ret_B, (np_A+1)*1) + + if os.getenv("LOCAL_CI_TEST"): + test_hls("csim|csyn") + else: + assert os.getenv("LOCAL_CI_TEST") == None + +def test_vitis_sim(): + def test_hls(target_mode): + hcl.init(hcl.Int(16)) + A = hcl.placeholder((10,), "A") + def kernel(A): + B = hcl.compute(A.shape, lambda *args : A[args] + 1, "B") + return B + + target = hcl.Platform.aws_f1 + s = hcl.create_schedule([A], kernel) + s.to(A, target.xcel) + s.to(kernel.B, target.host) + target.config(compiler="vitis", mode=target_mode) + f = hcl.build(s, target) + + np_A = np.random.randint(10, size=(10,)) + np_B = np.zeros((10,)) + + hcl_A = hcl.asarray(np_A, dtype=hcl.Int(16)) + hcl_B = hcl.asarray(np_B, dtype=hcl.Int(16)) + f(hcl_A, hcl_B) + ret_B = hcl_B.asnumpy() + np.testing.assert_array_equal(ret_B, (np_A+1)*1) + + if os.getenv("LOCAL_CI_TEST"): + test_hls("sw_sim") + else: + assert os.getenv("LOCAL_CI_TEST") == None + +def test_autosa_backend(): + def test_hls(size, target_mode): + m = size + n = size + k = size + + dtype=hcl.Float() + hcl.init(dtype) + + A = hcl.placeholder((m,k), dtype=dtype, name="A") + B = hcl.placeholder((k,n), dtype=dtype, name="B") + + def kernel(A, B): + Y = hcl.compute((m, n), lambda *args: 0, dtype=dtype, name="Y0") + with hcl.Stage("Y"): + with hcl.for_(0, m, name="i") as i: + with hcl.for_(0, n, name="j") as j: + Y[i][j] = 0 + with hcl.for_(0, k, name="k") as r: + Y[i][j] += A[i][r] * B[r][j] + return Y + + p = hcl.Platform.xilinx_zc706 + p.config(compiler="vitis", mode=target_mode) + + s = hcl.create_schedule([A, B], kernel) + MM = kernel.Y + + s.to([A, B, kernel.Y0], p.xcel) + s.to(kernel.Y.Y0, p.host) + + # intra-kernel data placement to create systolic araray + s[kernel.Y].systolic() + # using .to() as alternative to .systolic() for SA generation + # PEs = s[kernel.Y].unroll(axis=[0,1], explicit=True) + # for r in range(64): + # s.to(PEs[r,0].A, PEs[r,1]).to(PEs[r,2]).to(PEs[r,3])... 
+ # for c in range(64): + # s.to(PEs[0,c].B, PEs[1,c]).to(PEs[2,c]).to(PEs[3,c])... + + s.transpose(kernel.Y.B) + s.pack([MM.B, MM.A, MM.Y0], factor=512) + + np_A = np.random.randint(10, size=(m,k)) + np_B = np.random.randint(10, size=(k,n)) + np_C = np.zeros((m,n)) + args = (np_A, np_B, np_C) + + code = hcl.build(s, target=p) + assert code.count("PE_wrapper") == 8196, \ + "Wrong num of PEs in SA: {}".format(code.count("PE_wrapper")) + + if os.getenv("LOCAL_CI_TEST"): + if os.getenv("AUTOSA"): + assert os.path.exists(os.getenv("AUTOSA")) + test_hls(1024, "debug") + else: + assert os.getenv("LOCAL_CI_TEST") == None + +if __name__ == "__main__": + test_vivado_hls() + test_vitis_sim() + test_autosa_backend() diff --git a/tvm/HalideIR/src/ir/IR.cpp b/tvm/HalideIR/src/ir/IR.cpp index 53f8ff89a..ae1e58081 100644 --- a/tvm/HalideIR/src/ir/IR.cpp +++ b/tvm/HalideIR/src/ir/IR.cpp @@ -1312,6 +1312,9 @@ Call::ConstString Call::memoize_expr = "memoize_expr"; Call::ConstString Call::alloca = "alloca"; Call::ConstString Call::likely = "likely"; Call::ConstString Call::likely_if_innermost = "likely_if_innermost"; +Call::ConstString Call::transpose = "transpose"; +Call::ConstString Call::serialize = "serialize"; +Call::ConstString Call::deserialize = "deserialize"; Call::ConstString Call::register_destructor = "register_destructor"; Call::ConstString Call::div_round_to_zero = "div_round_to_zero"; Call::ConstString Call::mod_round_to_zero = "mod_round_to_zero"; diff --git a/tvm/HalideIR/src/ir/IR.h b/tvm/HalideIR/src/ir/IR.h index 3e4004743..2bfd90fb5 100644 --- a/tvm/HalideIR/src/ir/IR.h +++ b/tvm/HalideIR/src/ir/IR.h @@ -727,10 +727,11 @@ struct Call : public ExprNode { count_trailing_zeros, undef, return_second, if_then_else, glsl_texture_load, glsl_texture_store, glsl_varying, image_load, image_store, make_struct, stringify, memoize_expr, alloca, likely, - likely_if_innermost, register_destructor, div_round_to_zero, - mod_round_to_zero, call_cached_indirect_function, prefetch, 
- signed_integer_overflow, indeterminate_expression, bool_to_mask, - cast_mask, select_mask, extract_mask_element, size_of_halideir_buffer_t; + likely_if_innermost, transpose, serialize, deserialize, + register_destructor, div_round_to_zero, mod_round_to_zero, + call_cached_indirect_function, prefetch, signed_integer_overflow, + indeterminate_expression, bool_to_mask, cast_mask, select_mask, + extract_mask_element, size_of_halideir_buffer_t; // If it's a call to another halide function, this call node holds // onto a pointer to that function for the purposes of reference // counting only. Self-references in update definitions do not diff --git a/tvm/include/tvm/ir.h b/tvm/include/tvm/ir.h index 03fdb7d0a..4df14396b 100644 --- a/tvm/include/tvm/ir.h +++ b/tvm/include/tvm/ir.h @@ -252,6 +252,8 @@ constexpr const char* bind_scope = "bind_scope"; constexpr const char* stream_scope = "stream_scope"; constexpr const char* stream_attrs = "stream_attrs"; +// Define the desired tensor layout +constexpr const char* tensor_layout_attrs = "tensor_layout_attrs"; } // namespace attr diff --git a/tvm/include/tvm/ir_pass.h b/tvm/include/tvm/ir_pass.h index 2c2cf700b..f9ceda763 100644 --- a/tvm/include/tvm/ir_pass.h +++ b/tvm/include/tvm/ir_pass.h @@ -212,6 +212,7 @@ Stmt RemoveNoOp(Stmt stmt); * \return Transformed stmt. */ Stmt InferStream(Stmt stmt, Array api_args); +Stmt TransformLayout(Stmt stmt, Array api_args); /*! * \brief Split statement into pipeine stages. diff --git a/tvm/include/tvm/schedule.h b/tvm/include/tvm/schedule.h index df6db04e7..0b1728836 100644 --- a/tvm/include/tvm/schedule.h +++ b/tvm/include/tvm/schedule.h @@ -237,6 +237,7 @@ class Stage : public NodeRef { EXPORT Stage& stencil(int burst_width, int unroll_factor, int num_iteration); // NOLINT(*) + EXPORT Stage& systolic(); // NOLINT(*) /*! 
* \brief Annotate the iteration with pragma * @@ -338,6 +339,12 @@ class Schedule : public NodeRef { EXPORT Tensor reuse_at(const Tensor& target, Stage parent, IterVar axis, std::string name); + EXPORT Array explicit_unroll( + const Tensor& target, const Array axes, bool autosa); + + EXPORT void transform_layout( + Stage parent, const Tensor& target, Array shape); + EXPORT void to_stage(const Tensor& target, Stage dest, int arg_pos, ir::StreamType stream_type, int channel_depth, std::string name); diff --git a/tvm/src/api/api_lang.cc b/tvm/src/api/api_lang.cc index c8c67fa59..9c637e7fe 100644 --- a/tvm/src/api/api_lang.cc +++ b/tvm/src/api/api_lang.cc @@ -301,6 +301,10 @@ TVM_REGISTER_API("_StageStencil").set_body([](TVMArgs args, TVMRetValue* ret) { args[0].operator Stage().stencil(args[1], args[2], args[3]); }); +TVM_REGISTER_API("_StageSystolic").set_body([](TVMArgs args, TVMRetValue* ret) { + args[0].operator Stage().systolic(); +}); + TVM_REGISTER_API("_StagePragma").set_body([](TVMArgs args, TVMRetValue* ret) { args[0].operator Stage().pragma(args[1], args[2]); }); @@ -347,6 +351,12 @@ TVM_REGISTER_API("_SchedulePartition") static_cast(args[4].operator int())); }); +TVM_REGISTER_API("_TransformLayout") + .set_body([](TVMArgs args, TVMRetValue *ret) { + args[0].operator Schedule() + .transform_layout(args[1], args[2], args[3]); + }); + TVM_REGISTER_API("_ScheduleMoveToStage") .set_body([](TVMArgs args, TVMRetValue* ret) { args[0].operator Schedule().to_stage( diff --git a/tvm/src/api/api_pass.cc b/tvm/src/api/api_pass.cc index f5a94990d..0b5177725 100644 --- a/tvm/src/api/api_pass.cc +++ b/tvm/src/api/api_pass.cc @@ -130,6 +130,7 @@ REGISTER_PASS2(InjectDoubleBuffer); REGISTER_PASS2(LoopPartition); REGISTER_PASS1(RemoveNoOp); REGISTER_PASS2(InferStream); +REGISTER_PASS2(TransformLayout); REGISTER_PASS2(SplitPipeline); REGISTER_PASS2(LiftAttrScope); REGISTER_PASS1(NarrowChannelAccess); diff --git a/tvm/src/codegen/build_util.cc b/tvm/src/codegen/build_util.cc 
index 7fdd2fb88..99d8e399b 100644 --- a/tvm/src/codegen/build_util.cc +++ b/tvm/src/codegen/build_util.cc @@ -368,7 +368,7 @@ void PrintCopyBack(TVMArray* arr, std::vector arg_names, } } -// generate kernel code into files +// Generate kernel code into files void GenKernelCode(std::string& test_file, std::vector arg_names, std::string platform, std::string backend, std::string project) { @@ -380,28 +380,27 @@ void GenKernelCode(std::string& test_file, std::vector arg_names, if (platform == "aocl") kernel_ext = "cl"; stream.open(project + "/kernel." + kernel_ext); - // generate hash + // Generate hash for source kernel file std::hash hasher; stream << "// HASH:" << ((size_t)hasher(test_file) & 0xFFFFFFFF) << "\n"; - // create typedef and header + // Create typedef and header if (platform == "vivado_hls" || platform == "sdsoc") { - // add header file to host code + // Add header file to host code auto pos = test_file.rfind("#include "); auto next = test_file.find('\n', pos); test_file.insert(next + 1, "#include \"kernel.h\"\n"); - // create typedef list + // Create typedef list std::unordered_map typedef_map( {{"ap_uint<32>", "ubit32"}, {"ap_int<32>", "bit32"}}); - for (auto& kv : typedef_map) { while (test_file.find(kv.first) != std::string::npos) test_file.replace(test_file.find(kv.first), kv.first.length(), kv.second); } - // generate header file + // Generate header file std::ofstream header; header.open(project + "/kernel.h"); header << "#ifndef __KERNEL_H__\n" @@ -413,14 +412,14 @@ void GenKernelCode(std::string& test_file, std::vector arg_names, header << "typedef " << kv.first << " " << kv.second << ";\n"; } - // locate top function + // Locate top function CHECK(test_file.find("test(") != std::string::npos) << "cannot find top function"; size_t dut = test_file.find("test("); size_t begin = test_file.rfind('\n', dut); size_t end = test_file.find(')', dut) + 1; - // TODO(hecmay): better way to specify prgamas + // TODO(hecmay): better way to specify pragmas if 
(platform == "sdsoc") { // TODO(hecmay): direct memory interface with PL and DDR header << "#pragma SDS data copy("; diff --git a/tvm/src/codegen/codegen_c.cc b/tvm/src/codegen/codegen_c.cc index 710a3da50..9257717f3 100644 --- a/tvm/src/codegen/codegen_c.cc +++ b/tvm/src/codegen/codegen_c.cc @@ -860,6 +860,12 @@ void CodeGenC::VisitExpr_(const Call* op, std::ostream& os) { // NOLINT(*) os << "("; this->PrintExpr(op->args[0], os); os << " == NULL)"; + } else if (op->is_intrinsic(Call::transpose)) { + LOG(WARNING) << "Intrinsic transpose not implemented yet"; + } else if (op->is_intrinsic(Call::serialize)) { + LOG(WARNING) << "Intrinsic serialize not implemented yet"; + } else if (op->is_intrinsic(Call::deserialize)) { + LOG(WARNING) << "Intrinsic deserialize not implemented yet"; } else { if (op->call_type == Call::Intrinsic || op->call_type == Call::PureIntrinsic) { diff --git a/tvm/src/codegen/opencl/codegen_xocl_host.cc b/tvm/src/codegen/opencl/codegen_xocl_host.cc index 65a84d1b7..4880c8648 100644 --- a/tvm/src/codegen/opencl/codegen_xocl_host.cc +++ b/tvm/src/codegen/opencl/codegen_xocl_host.cc @@ -215,6 +215,66 @@ void CodeGenXOCLHost::VisitStmt_(const Allocate* op) { this->PrintStmt(op->body); } +void CodeGenXOCLHost::VisitExpr_(const Call* op, + std::ostream& os) { // NOLINT(*) + if (op->is_intrinsic(Call::transpose)) { + CHECK_EQ(op->args.size(), 3); + decl_stream << "#include \n"; + decl_stream << R"( +template +void transpose(RandomIterator first, RandomIterator last, int m) +{ + const int mn1 = (last - first - 1); + const int n = (last - first) / m; + std::vector visited(last - first); + RandomIterator cycle = first; + while (++cycle != last) { + if (visited[cycle - first]) + continue; + int a = cycle - first; + do { + a = a == mn1 ? 
mn1 : (n * a) % mn1; + std::swap(*(first + a), *cycle); + visited[a] = true; + } while ((first + a) != cycle); + } +} +)"; + + // Expected output: transpose(B, B+size, dim0) + os << "transpose("; + this->PrintExpr(op->args[0], os); + os << ".begin(), "; + this->PrintExpr(op->args[0], os); + os << ".end(), " << op->args[2] << ")"; + + } else if (op->is_intrinsic(Call::serialize)) { + // Expected serilization in host program + // std::vector> dev_A(SIZE); + // host_serialize_A(dev_A, A); + CHECK_EQ(op->args.size(), 2); + auto ptr = op->args[0].as(); + auto name = ptr->value; + auto type = op->args[1].as()->value; + // Create an align allocator for device memory + // Since the seriliazation buffer size depends on the access pattern + // and is decided by AutoSA. Here we just leave a placeholder and + // leave to code post-processing to substitute it + os << "host_serialize_" << name << "(" << name << "_dev_ser.data(), " + << name << ".data())"; + + } else if (op->is_intrinsic(Call::deserialize)) { + CHECK_EQ(op->args.size(), 2); + auto ptr = op->args[0].as(); + auto name = ptr->value; + os << "host_deserialize_" << name << "(" << name << ".data(), " << name + << "_dev_deser.data())"; + + } else { + CodeGenC::VisitExpr_(op, os); + } +} + void CodeGenXOCLHost::VisitStmt_(const KernelStmt* op) { using TVM::ir::IoInfo; std::string name = op->name; diff --git a/tvm/src/codegen/opencl/codegen_xocl_host.h b/tvm/src/codegen/opencl/codegen_xocl_host.h index 00d855fe1..025f864e6 100644 --- a/tvm/src/codegen/opencl/codegen_xocl_host.h +++ b/tvm/src/codegen/opencl/codegen_xocl_host.h @@ -21,6 +21,7 @@ class CodeGenXOCLHost : public CodeGenC { void VisitExpr_(const Min* op, std::ostream& os) override; void VisitExpr_(const Max* op, std::ostream& os) override; + void VisitExpr_(const Call* op, std::ostream& os) override; void VisitStmt_(const For* op) override; void VisitStmt_(const IfThenElse* op) override; diff --git a/tvm/src/pass/adjust_buffer_binding.cc 
b/tvm/src/pass/adjust_buffer_binding.cc index 7b3992441..0b22ef64e 100644 --- a/tvm/src/pass/adjust_buffer_binding.cc +++ b/tvm/src/pass/adjust_buffer_binding.cc @@ -1,6 +1,6 @@ /*! * Copyright (c) 2019 by Contributors - * \file adjust_buffer_binding.cc + * \file loop_partition.cc */ #include #include @@ -33,6 +33,42 @@ class BufferBindingAdjuster final : public IRMutator { return IRMutator::Mutate_(op, s); } + Stmt Mutate_(const Stencil* op, const Stmt& s) { + Array new_inputs; + Array new_outputs; + for (auto& e : op->inputs) { + if (HandleUse(e)) { + HCL_DEBUG_LEVEL(2) << "Undefined Stencil input: " << e; + CHECK(e.as()); + auto name = e.as()->name_hint; + CHECK(name_var_map_.count(name)) << name; + VarExpr new_buf(name_var_map_[name].node_); + new_inputs.push_back(new_buf); + } else { + new_inputs.push_back(e); + } + } + + for (auto& e : op->outputs) { + if (HandleUse(e)) { + HCL_DEBUG_LEVEL(2) << "Undefined Stencil output: " << e; + CHECK(e.as()); + auto name = e.as()->name_hint; + if (name_var_map_.count(name)) { + VarExpr new_buf(name_var_map_[name].node_); + new_outputs.push_back(new_buf); + } else { + new_outputs.push_back(e); + } + } else { + new_outputs.push_back(e); + } + } + Stmt body = this->Mutate(op->body); + return Stencil::make(new_inputs, new_outputs, body, op->burst_width, + op->unroll_factor, op->num_iteration); + } + Expr Mutate_(const Let* op, const Expr& e) { HandleDef(op->var); return this->Mutate(op->body); @@ -117,15 +153,24 @@ class BufferBindingAdjuster final : public IRMutator { return IRMutator::Mutate_(op, s); } - Expr Mutate_(const Variable* op, const Expr& e) { - if (HandleUse(e)) { - HCL_DEBUG_LEVEL(2) << "Undefined Variable buffer: " << e; - auto buffer_name = op->name_hint; - CHECK(name_var_map_.count(buffer_name)) << buffer_name; - VarExpr new_buf(name_var_map_[buffer_name].node_); - return new_buf; + Expr Mutate_(const Call* op, const Expr& e) { + if (op->is_intrinsic(Call::transpose)) { + CHECK_EQ(op->args.size(), 3); + if 
(HandleUse(op->args[0])) { + auto var = op->args[0].as(); + CHECK(var); + HCL_DEBUG_LEVEL(2) << "Undefined instrinsic buffer: " << e; + auto buffer_name = var->name_hint; + CHECK(name_var_map_.count(buffer_name)) << buffer_name; + VarExpr new_buf(name_var_map_[buffer_name].node_); + return Call::make(Int(32), "transpose", + {new_buf, op->args[1], op->args[2]}, Call::Intrinsic); + } else { + return IRMutator::Mutate_(op, e); + } + } else { + return IRMutator::Mutate_(op, e); } - return IRMutator::Mutate_(op, e); } Stmt Mutate_(const Partition* op, const Stmt& s) { @@ -166,6 +211,13 @@ class BufferBindingAdjuster final : public IRMutator { return IRMutator::Mutate_(op, e); } + Expr Mutate_(const Variable* op, const Expr& e) { + if (HandleUse(e)) { + HCL_DEBUG_LEVEL(2) << "Undefined Variable buffer: " << e; + } + return IRMutator::Mutate_(op, e); + } + Expr Mutate_(const StreamExpr* op, const Expr& e) { if (HandleUse(op->buffer_var)) { HCL_DEBUG_LEVEL(2) << "Undefined StreamExpr buffer: " << e; @@ -245,18 +297,13 @@ Stmt AdjustBufferBinding(Stmt stmt, Array arg_list) { shape_map[node->data.get()] = node->shape; input_args.push_back(node->data); buffer_map[node->data.get()] = node->data; - } else { - const Variable* v = arg_list[i].as(); - CHECK(v) << "Illegal argument " << arg_list[i]; - Var input_var(arg_list[i].node_); - shape_map[v] = {1}; - input_args.push_back(input_var); - buffer_map[v] = input_var; } } Array undefined = UndefinedVars(stmt, input_args); if (undefined.size() > 0) { HCL_DEBUG_LEVEL(2) << "Fonud mismatching buffers in the stmt..."; + HCL_DEBUG_LEVEL(2) << "----------------- stmt -----------------"; + HCL_DEBUG_LEVEL(2) << stmt; for (auto& v : undefined) { HCL_DEBUG_LEVEL(2) << " " << v << "(" << v.get() << ")"; } diff --git a/tvm/src/pass/stream_inference.cc b/tvm/src/pass/stream_inference.cc index b8dd568a9..6b4e4a725 100644 --- a/tvm/src/pass/stream_inference.cc +++ b/tvm/src/pass/stream_inference.cc @@ -1834,8 +1834,23 @@ class FifoAccessChecker 
final : public IRMutator { std::map min_map_; }; +// Collect loop nest loop bound information +class CollectLoopNestBound final : public IRMutator { + public: + vector bounds; + Stmt Mutate_(const For* op, const Stmt& s) { + bounds.push_back(Simplify(op->extent)); + Stmt stmt = this->Mutate(op->body); + return For::make(op->loop_var, op->min, op->extent, op->for_type, + op->device_api, stmt, op->annotate_keys, + op->annotate_values); + } +}; + class ExternModuleFormater final : public IRMutator { public: + ExternModuleFormater(unordered_set top_arg_names) + : top_arg_names_(top_arg_names) {} // Collect information of streamed module args Stmt Mutate_(const ExternModule* op, const Stmt& s) { if (collect_info) { @@ -1880,6 +1895,59 @@ class ExternModuleFormater final : public IRMutator { } else { CHECK(port_types_map.count(op)); CHECK(arg_names_map.count(op)); + + // Collect and inject loop information into ExternMod node + if (op->attr_key == "autosa") { + Expr value = this->Mutate(op->value); + Stmt body = this->Mutate(op->body); + auto annotate_keys = op->annotate_keys; + auto annotate_values = op->annotate_values; + + // Collect loop bound information + CollectLoopNestBound collector; + collector.Mutate(op->body); + annotate_keys.push_back(StringImm::make("loop_bound")); + std::string bound_info; + std::string delim = ""; + for (auto& e : collector.bounds) { + CHECK(e.as()) << e; + bound_info += delim + std::to_string(e.as()->value); + delim = ","; + } + annotate_values.push_back(StringImm::make(bound_info)); + + // Collect input tensor placement and read/write information + Array input_vars = UndefinedVars(body, Array()); + Array input_args; + for (auto& v : input_vars) { + input_args.push_back(v); + } + InputDirectionCollector idc(input_args); + auto is_arg_written = idc.Analyze(body); + + annotate_keys.push_back(StringImm::make("tensor_placement")); + delim = ""; + std::string placement_info; + for (auto& var : input_vars) { + string var_name = 
var.get()->name_hint; + placement_info += delim + var_name; + if (top_arg_names_.find(var_name) != top_arg_names_.end()) { + placement_info += "[0]"; // located on off-chip memory + } else { + placement_info += "[1]"; // loacted on on-chip memory + } + CHECK(is_arg_written.count(var_name)) << var_name; + if (is_arg_written.at(var_name)) { + placement_info += "[write]"; + } else { + placement_info += "[read]"; + } + delim = ","; + } + annotate_values.push_back(StringImm::make(placement_info)); + return ExternModule::make(op->attr_key, value, body, annotate_keys, + annotate_values); + } } Stmt stmt = IRMutator::Mutate_(op, s); @@ -1924,6 +1992,7 @@ class ExternModuleFormater final : public IRMutator { return Mutate(stmt); } + unordered_set top_arg_names_; bool collect_info{false}; unordered_map> port_types_map; unordered_map> arg_names_map; @@ -2453,7 +2522,7 @@ Stmt InferStream(Stmt stmt, Array api_args) { // Check the Extern Module // Convert streaming FIFOs into StreamAlloc - ExternModuleFormater emf; + ExternModuleFormater emf(sic.top_arg_names); stmt = emf.Format(stmt); // Handle self loopback streaming channels diff --git a/tvm/src/pass/transform_layout.cc b/tvm/src/pass/transform_layout.cc new file mode 100644 index 000000000..6f9bc800c --- /dev/null +++ b/tvm/src/pass/transform_layout.cc @@ -0,0 +1,866 @@ +/*! + * Copyright (c) 2021 by Contributors + * Restore task graph and tranform layout. 
+ */ +// Transform the tensor layout based on annotation +#include +#include +#include +#include +#include +#include +#include +#include "../arithmetic/compute_expr.h" + +namespace TVM { +namespace ir { + +using std::string; +using std::unordered_map; +using std::unordered_set; +using std::vector; + +struct TaskNode { + string name; + // Tensor being updated in the task + unordered_set updated_tensors; + // Input tensors needed by the task + unordered_set input_tensors; + // Children tasks name + unordered_set children; + // Parent tasks name + unordered_set parents; +}; + +struct TransformInfo { + string name; + VarExpr var; + string anchor_producer; + Array origin_shape; + Array target_shape; + Type origin_type; + Type type; + bool is_transpose; + bool is_pack; + int pack_factor; + bool is_written; +}; + +class TensorSubstitution final : public IRMutator { + public: + TensorSubstitution(unordered_map& vmap) + : vmap_(vmap) {} + + Stmt Mutate_(const KernelStmt* op, const Stmt& s) final { + Array new_args; + for (auto& e : op->args) { + auto ptr = e.as(); + CHECK(ptr) << e; + bool is_found = false; + Expr new_buf; + for (auto& kv : vmap_) { + if (kv.first->name_hint == ptr->name_hint) { + HCL_DEBUG_LEVEL(2) << " -- [substitute] " << ptr->name_hint + << " in kernel " << op->name; + is_found = true; + new_buf = Expr(kv.second.node_); + } + } + if (is_found) { + CHECK(new_buf.defined()); + new_args.push_back(new_buf); + } else { + new_args.push_back(e); + } + } + return KernelStmt::make(new_args, op->name, op->annotate_keys, + op->annotate_values); + } + unordered_map& vmap_; +}; + +Stmt SubstituteTensor(Stmt s, unordered_map vmap) { + return TensorSubstitution(vmap).Mutate(s); +} + +// Return string repr of type +string Type2Str(Type type) { + string str = "int"; + if (type.code() == Type::Float) { + str = "float"; + } else if (type.code() == Type::Int) { + str = "int"; + } else if (type.code() == Type::UInt) { + str = "uint"; + } + return str + 
std::to_string(type.bits()); +} + +class TransformedBufferInserter final : public IRMutator { + public: + TransformedBufferInserter(std::string target_producer, TransformInfo& info) + : target_producer_(target_producer), info_(info) {} + + // Insert buffer before the producer stage + Stmt Mutate_(const ProducerConsumer* op, const Stmt& s) { + if (op->is_producer) { + std::string name = op->func->func_name(); + if (name == target_producer_) { + Stmt body = this->Mutate(op->body); + HCL_DEBUG_LEVEL(2) << "[ debug ] insert layout transformation before " + << name; + VarExpr var(info_.name + ".new"); + VarExpr old_var(info_.var.node_); + Type type = info_.type; + Array origin_shape = info_.origin_shape; + + std::string dtype; + if (info_.origin_type.code() == Type::Int) { + dtype = "int"; + } else if (info_.origin_type.code() == Type::UInt) { + dtype = "uint"; + } else if (info_.origin_type.code() == Type::Float) { + dtype = "float"; + } + + // For packed-only var on interface, since the passed into memory + // is stored in major fashion continuously, so the data is automatically + // packed already + if (info_.is_pack && !info_.is_transpose) { + // Insert serilization intrisic for AutoSA + if (!info_.is_written) { + // Insert allocation dev_ser_tensor + VarExpr new_var(info_.name + ".dev.ser"); + unordered_map vmap; + vmap[info_.var.get()] = new_var; + + // resize the serialized buffer since if may have replicates + // due to certain access pattern + body = SubstituteTensor(body, vmap); + body = ProducerConsumer::make(op->func, op->is_producer, body); + + Stmt serialize = Evaluate::make(Call::make( + Int(32), "serialize", {info_.name, dtype}, Call::Intrinsic)); + body = Block::make(serialize, body); + + body = + Allocate::make(new_var, info_.origin_type, info_.origin_shape, + make_const(Bool(type.lanes()), true), body); + body = AttrStmt::make(new_var, attr::storage_scope, + StringImm::make("global"), body); + return body; + + } else { + VarExpr new_var(info_.name + 
".dev.deser"); + unordered_map vmap; + vmap[info_.var.get()] = new_var; + body = SubstituteTensor(body, vmap); + body = ProducerConsumer::make(op->func, op->is_producer, body); + + Stmt deserialize = Evaluate::make(Call::make( + Int(32), "deserialize", {info_.name, dtype}, Call::Intrinsic)); + body = ProducerConsumer::make(op->func, op->is_producer, body); + body = Block::make(body, deserialize); + + body = + Allocate::make(new_var, info_.origin_type, info_.origin_shape, + make_const(Bool(type.lanes()), true), body); + body = AttrStmt::make(new_var, attr::storage_scope, + StringImm::make("global"), body); + return body; + } + + // Insert an instrinsic to do in-place matrix tranposition + } else if (info_.is_transpose) { + int size = 1; + for (auto& dim : origin_shape) { + auto ptr = dim.as(); + CHECK(ptr); + size *= ptr->value; + } + + VarExpr new_var(info_.name + ".dev.ser"); + unordered_map vmap; + vmap[info_.var.get()] = new_var; + body = SubstituteTensor(body, vmap); + body = ProducerConsumer::make(op->func, op->is_producer, body); + + Stmt serialize = Evaluate::make(Call::make( + Int(32), "serialize", {info_.name, dtype}, Call::Intrinsic)); + body = Block::make(serialize, body); + + body = Allocate::make(new_var, info_.origin_type, info_.origin_shape, + make_const(Bool(type.lanes()), true), body); + body = AttrStmt::make(new_var, attr::storage_scope, + StringImm::make("global"), body); + + // In-place matrix transposition + Stmt trans = Evaluate::make( + Call::make(Int(32), "transpose", {old_var, size, origin_shape[0]}, + Call::Intrinsic)); + body = Block::make(trans, body); + return body; + + // Insert reshaping logic explicitly + } else { + // Substitute buffer + unordered_map vmap; + vmap[info_.var.get()] = var; + body = SubstituteTensor(body, vmap); + HCL_DEBUG_LEVEL(2) << "------------- Substitue ---------"; + HCL_DEBUG_LEVEL(2) << " from " << info_.var << " to " << var; + HCL_DEBUG_LEVEL(2) << "Inside body: " << body; + } + + // Insert pack-only loop + 
if (info_.is_pack) { + std::vector indices, new_indices; + std::vector loop_vars; + std::unordered_map range_; + for (size_t i = 0; i < origin_shape.size(); i++) { + VarExpr iter(name + ".pack.r" + std::to_string(i)); + indices.push_back(iter); + new_indices.push_back(iter); + loop_vars.push_back(iter); + range_[iter.get()] = Simplify(origin_shape[i] - 1); + } + // Dim for data packing + VarExpr iter(name + ".pack.r"); + indices.push_back(iter); + loop_vars.push_back(iter); + + // Expected output IR (example 512-packing) + // for i (0, 64) + // for j (0, 4) + // A.new[i,j] = 0 + // for p (0, 16) + // A.new[i,j](32*p+32, 32*p) = A[i,j*16+p] + Array pack_shape = info_.target_shape; + pack_shape.push_back(info_.pack_factor); + Expr pack_index = FlattenIndices(indices, pack_shape); + Expr new_index = FlattenIndices(new_indices, info_.target_shape); + + // Pack + tranpose + // Expected output IR (example 512-packing) + // for i (0, 64) + // for j (0, 4) + // A.new[i,j] = 0 + // for p (0, 16) + // A.new[i,j](32*p+32, 32*p) = A[j*16+p,i] + if (info_.is_transpose) { + // Move last two iters to the front (i,(j,p)) to ((j,p),i). 
Left + // shifting + std::vector transpose_indices = {indices[1], indices[2], + indices[0]}; + pack_index = FlattenIndices(transpose_indices, pack_shape); + } + Expr load = + Load::make(type, old_var, pack_index, UIntImm::make(UInt(1), 1)); + Expr slice = + SetSlice::make(var, load, (1 + iter) * info_.pack_factor - 1, + iter * info_.pack_factor); + Stmt for_stmt = + Store::make(var, slice, new_index, UIntImm::make(UInt(1), 1)); + + auto for_type = ForType::Serial; + int bound = pack_shape.size(); + for (int j = bound - 1; j >= 0; j--) { + auto iter = loop_vars[j]; + for_stmt = For::make(VarExpr(iter.node_), 0, pack_shape[j], + for_type, DeviceAPI::None, for_stmt); + // Insert initialization store + if (j == bound - 1) { + Stmt init = + Store::make(var, 0, new_index, UIntImm::make(UInt(1), 1)); + for_stmt = Block::make(init, for_stmt); + } + } + body = Block::make(for_stmt, body); + + // Tensor transpose only + } else { + std::vector indices; + std::vector reverse_indices; + std::vector loop_vars; + for (size_t i = 0; i < origin_shape.size(); i++) { + VarExpr iter(name + ".transpose.r" + std::to_string(i)); + indices.push_back(iter); + reverse_indices.insert(reverse_indices.begin(), iter); + loop_vars.push_back(iter); + } + Expr reverse_index = FlattenIndices(reverse_indices, origin_shape); + Expr index = FlattenIndices(indices, origin_shape); + Expr load = + Load::make(type, old_var, index, UIntImm::make(UInt(1), 1)); + Stmt for_stmt = + Store::make(var, load, reverse_index, UIntImm::make(UInt(1), 1)); + + auto for_type = ForType::Serial; + for (size_t j = 0; j < origin_shape.size(); j++) { + auto iter = loop_vars[j]; + for_stmt = For::make(VarExpr(iter.node_), 0, origin_shape[j], + for_type, DeviceAPI::None, for_stmt); + } + body = Block::make(for_stmt, body); + HCL_DEBUG_LEVEL(2) << "[ debug ] tranpose loop for " << var; + HCL_DEBUG_LEVEL(2) << for_stmt; + } + + body = Allocate::make(var, type, info_.target_shape, + make_const(Bool(type.lanes()), true), body); + 
body = AttrStmt::make(var, attr::storage_scope, + StringImm::make("global"), body); + return ProducerConsumer::make(op->func, op->is_producer, body); + } + } + return IRMutator::Mutate_(op, s); + } + std::string target_producer_; + TransformInfo& info_; +}; + +class IndicesTransformer final : public IRMutator { + public: + IndicesTransformer(std::unordered_map& range, + std::vector& loop_iter_vars, TransformInfo& info) + : range_(range), loop_iter_vars_(loop_iter_vars), info_(info) {} + + // For AutoSA backend. Just inject the information without + // changing the IR + Stmt Mutate_(const ExternModule* op, const Stmt& s) { + has_autosa_module = true; + Expr value = this->Mutate(op->value); + Stmt body = this->Mutate(op->body); + auto annotate_keys = op->annotate_keys; + auto annotate_values = op->annotate_values; + + annotate_keys.push_back(StringImm::make(info_.name)); + string attr = info_.is_transpose ? "1" : "0"; + attr += "," + std::to_string(info_.pack_factor); + annotate_values.push_back(StringImm::make(attr)); + + return ExternModule::make(op->attr_key, value, body, annotate_keys, + annotate_values); + } + + // Mutate the function argument + Stmt Mutate_(const KernelDef* op, const Stmt& s) override { + has_autosa_module = false; + Stmt body = this->Mutate(op->body); + Array args; + Array> arg_shapes; + Array arg_types; + + for (size_t k = 0; k < op->args.size(); k++) { + auto name = op->args[k].get()->name_hint; + if (name == info_.name && !has_autosa_module) { + // Create arg with same node + VarExpr new_var(info_.name, info_.type); + args.push_back(new_var); + arg_shapes.push_back(info_.target_shape); + string type = Type2Str(info_.type); + arg_types.push_back(StringImm::make(type)); + } else { + args.push_back(op->args[k]); + arg_shapes.push_back(op->arg_shapes[k]); + arg_types.push_back(op->arg_types[k]); + } + } + has_autosa_module = false; + return KernelDef::make(args, arg_shapes, arg_types, op->arg_tensors, body, + op->ret_void, op->ret_type, 
op->name, + op->attributes); + } + + // Collect for loop information + Stmt Mutate_(const For* op, const Stmt& s) override { + range_[op->loop_var.get()] = Simplify(op->extent - 1); + loop_iter_vars_.push_back(op->loop_var); + Stmt stmt = IRMutator::Mutate_(op, s); + return stmt; + } + + Stmt Mutate_(const Store* op, const Stmt& s) { + string target_tensor_name_ = info_.name; + Array shape_ = info_.origin_shape; + if (target_tensor_name_ == op->buffer_var.get()->name_hint) { + info_.is_written = true; + if (info_.is_transpose) { + auto indices = ExtractIndices(op->index, shape_, range_); + std::reverse(indices.begin(), indices.end()); + auto new_index = FlattenIndices(indices, shape_); + return Store::make(op->buffer_var, op->value, new_index, op->predicate); + } + } + return IRMutator::Mutate_(op, s); + } + + Expr Mutate_(const Load* op, const Expr& e) { + string target_tensor_name_ = info_.name; + Array shape_ = info_.origin_shape; + + if (info_.is_transpose) { + if (target_tensor_name_ == op->buffer_var.get()->name_hint) { + auto indices = ExtractIndices(op->index, shape_, range_); + std::reverse(indices.begin(), indices.end()); + auto new_index = FlattenIndices(indices, shape_); + return Load::make(op->type, op->buffer_var, new_index, op->predicate); + } + } + return IRMutator::Mutate_(op, e); + } + + std::unordered_map& range_; + std::vector& loop_iter_vars_; + TransformInfo& info_; + bool has_autosa_module{false}; +}; + +// Insert new buffer before anchor (producer) stage +Stmt InsertReshapeBuffer(Stmt s, TransformInfo& info, + unordered_map& task_map_, + vector kernel_input_names) { + string producer = info.anchor_producer; + string tensor_name = info.name; + + CHECK(task_map_.count(producer)); + bool is_top_arg = false; + int arg_index = 0; + for (auto v : kernel_input_names) { + if (v == tensor_name) { + is_top_arg = true; + break; + } + arg_index++; + } + + // TODO(hecmay): handles on-chip data packing as well + if (is_top_arg) { + HCL_DEBUG_LEVEL(2) << " 
[ debug ] tensor " << tensor_name + << " is on top function interface"; + string target_producer = "test"; + TransformedBufferInserter tbi(target_producer, info); + return tbi.Mutate(s); + } + return s; +} + +// Update the buffer indices. If we want to +// tranpose, then reverse. Otherwise insert +// unpacking logic by default +Stmt UpdateBufferLayout(Stmt s, TransformInfo& info, + unordered_map& task_map_, + vector kernel_input_names) { + string producer = info.anchor_producer; + string tensor_name = info.name; + Stmt stmt = s; + + CHECK(task_map_.count(producer)); + bool is_top_arg = false; + int arg_index = 0; + for (auto v : kernel_input_names) { + if (v == tensor_name) { + is_top_arg = true; + break; + } + arg_index++; + } + + // Update buffer access indices and kernel + // function signature as well + if (is_top_arg) { + std::unordered_map range_; + std::vector loop_iter_vars_; + IndicesTransformer ivc(range_, loop_iter_vars_, info); + stmt = ivc.Mutate(stmt); + } else { + } + return stmt; +} + +// Collect tensor type and shape information +class TypeShapeCollector final : public IRMutator { + public: + TypeShapeCollector(Array& api_args) { + for (size_t i = 0; i < api_args.size(); i++) { + if (const Variable* v = api_args[i].as()) { + top_arg_names.insert(v->name_hint); + + } else if (auto buf = api_args[i].as()) { + CHECK(buf->data.as()); + top_arg_names.insert(buf->name); + shape_[buf->data.get()->name_hint] = buf->shape; + dtype_[buf->data.get()->name_hint] = buf->dtype; + HCL_DEBUG_LEVEL(2) << " [ collect shape ] " << buf->name; + } + } + } + + Stmt Mutate_(const Allocate* op, const Stmt& s) final { + auto v = op->buffer_var.get(); + auto name = v->name_hint; + // Save shape and dtype information + shape_[name] = op->extents; + dtype_[name] = op->type; + HCL_DEBUG_LEVEL(2) << " [ collect shape ] " << name; + return IRMutator::Mutate_(op, s); + } + + Stmt Mutate_(const KernelDef* op, const Stmt& s) { + for (size_t i = 0; i < op->args.size(); i++) { + 
string name = op->args[i].get()->name_hint; + auto shape = op->arg_shapes[i]; + shape_[name] = shape; + CHECK(op->arg_types[i].as()); + dtype_[name] = Str2Type(op->arg_types[i].as()->value); + HCL_DEBUG_LEVEL(2) << " [ collect shape ] " << name; + } + return IRMutator::Mutate_(op, s); + } + + Type Str2Type(string type_str) { + if (type_str.find("int") == 0) { + type_str.erase(0, 3); + int bits = std::atoi(type_str.c_str()); + return Int(bits); + } else if (type_str.find("uint") == 0) { + type_str.erase(0, 4); + int bits = std::atoi(type_str.c_str()); + return UInt(bits); + } else if (type_str.find("float") == 0) { + type_str.erase(0, 5); + int bits = std::atoi(type_str.c_str()); + return Float(bits); + } + return Int(32); + } + + unordered_set top_arg_names; + unordered_map> shape_; + unordered_map dtype_; +}; + +void CollectTypeShape(Stmt body, unordered_map>& shape, + unordered_map& dtype, + Array& api_args) { + HCL_DEBUG_LEVEL(2) << "---------- collect shape/dtype ---------"; + TypeShapeCollector tsc(api_args); + tsc.Mutate(body); + dtype = tsc.dtype_; + shape = tsc.shape_; +} + +// Check all the tensors in the Stmt. 
Get information
+// of their access pattern (write_only, read_only or read_write)
+//
+// A tensor is recorded as "updated" when it is the target of a Store and as
+// an "input" when it is read by a Load; a tensor that is both stored to and
+// loaded from ends up in both sets. Buffers whose Allocate node has already
+// been visited are remembered in local_buffers and are left out of both sets.
+// NOTE(review): the template arguments of as() and unordered_set are not
+// visible in this copy of the patch; the sets are filled with the raw
+// pointer returned by buffer_var.get().
+class BufferStatusCollector : public ir::IRMutator {
+ public:
+  // Record the destination buffer of a Store in updated_tensors unless the
+  // buffer is already known to be locally allocated.
+  Stmt Mutate_(const Store* op, const Stmt& s) {
+    // Run the default mutator on this node first, then re-read op from the
+    // statement it returned.
+    Stmt stmt = IRMutator::Mutate_(op, s);
+    op = stmt.as();
+    if (!local_buffers.count(op->buffer_var.get())) {
+      updated_tensors.insert(op->buffer_var.get());
+    }
+    return stmt;
+  }
+
+  // Record the source buffer of a Load in input_tensors unless the buffer is
+  // already known to be locally allocated.
+  Expr Mutate_(const Load* op, const Expr& e) {
+    // NOTE(review): name is computed but never used in this method.
+    auto name = op->buffer_var.get()->name_hint;
+    if (!local_buffers.count(op->buffer_var.get())) {
+      input_tensors.insert(op->buffer_var.get());
+    }
+    return IRMutator::Mutate_(op, e);
+  }
+
+  // Register the allocated buffer as local before the default mutator runs
+  // on this node, so that accesses handled afterwards see it in
+  // local_buffers.
+  Stmt Mutate_(const Allocate* op, const Stmt& s) {
+    local_buffers.insert(op->buffer_var.get());
+    Stmt stmt = IRMutator::Mutate_(op, s);
+    // NOTE(review): op is re-assigned here but not read again.
+    op = stmt.as();
+    return stmt;
+  }
+
+  // Buffers introduced by an Allocate node visited so far
+  unordered_set local_buffers;
+  // Non-local buffers that were the target of a Store
+  unordered_set updated_tensors;
+  // Non-local buffers that were read by a Load
+  unordered_set input_tensors;
+};
+
+// Each task in the graph represents the logic performed by
+// a HCL stage. The task graph is a coarse grained DFG.
+// There is no control flow branching across different tasks
+//
+// The builder walks the IR, enters "device scope" at the KernelDef named
+// "test", and creates one TaskNode in task_map for every top-level producer
+// ProducerConsumer node found inside that scope. Parent/child edges are
+// added from the variables that UndefinedVars reports for the producer body.
+class TaskGraphBuilder : public IRMutator {
+ public:
+  // api_args is accepted but not used by the constructor.
+  explicit TaskGraphBuilder(Array api_args) {}
+
+  // For the kernel named "test": remember its arguments, visit its body with
+  // device_scope_ set, and rebuild the KernelDef around the visited body.
+  // Any other kernel is handled by the default mutator.
+  Stmt Mutate_(const KernelDef* op, const Stmt& s) {
+    if (op->name == "test") {
+      device_scope_ = true;
+
+      // Save the input tensors
+      for (auto& v : op->args) {
+        kernel_input_args.push_back(v);
+      }
+      Stmt body = this->Mutate(op->body);
+      device_scope_ = false;
+      return KernelDef::make(op->args, op->arg_shapes, op->arg_types,
+                             op->arg_tensors, body, op->ret_void, op->ret_type,
+                             op->name, op->attributes);
+    } else {
+      return IRMutator::Mutate_(op, s);
+    }
+  }
+
+  // Turn a top-level producer inside the device scope into a task node.
+  // top_level_producer_ is cleared while the body is visited, so nested
+  // ProducerConsumer nodes take the default path at the bottom and do not
+  // create tasks of their own.
+  Stmt Mutate_(const ProducerConsumer* op, const Stmt& s) {
+    if (top_level_producer_ && device_scope_) {
+      top_level_producer_ = false;
+      Stmt body = this->Mutate(op->body);
+      if (op->is_producer) {
+        std::string name = op->func->func_name();
+
+        // Create a task node in the graph
+        // (the collector runs on the original op->body, not on the
+        // mutated body)
+        BufferStatusCollector bsc;
+        bsc.Mutate(op->body);
+        TaskNode task = {name, bsc.updated_tensors, bsc.input_tensors, {}, {}};
+
+        // Checking depending input tensors
+        Array kernel_input_vars;
+        for (auto& input : kernel_input_args) {
+          Var v(input.node_);
+          kernel_input_vars.push_back(v);
+        }
+        Array undefs = UndefinedVars(body, kernel_input_vars);
+        // The task can be a producer of a tensor, or it will just update
+        // a set of tensors. If the input tensor is not defined in this task
+        // nor in the input arguments, then it must have been defined in the
+        // previous tasks visited in the traversal
+        for (auto& var : undefs) {
+          auto parents = checkTensorLiveness(var.get());
+          for (auto& parent_task_name : parents) {
+            task.parents.insert(parent_task_name);
+            CHECK(task_map.count(parent_task_name));
+            task_map[parent_task_name].children.insert(name);
+          }
+        }
+        // The task is stored only after its parents were looked up, so the
+        // lookup above sees only previously recorded tasks.
+        task_map[name] = task;
+        HCL_DEBUG_LEVEL(2) << "[ debug ] producing tensor " << name;
+      }
+      top_level_producer_ = true;
+      return ProducerConsumer::make(op->func, op->is_producer, body);
+    }
+    return IRMutator::Mutate_(op, s);
+  }
+
+  // Return the names of the tasks recorded in task_map whose updated_tensors
+  // contain the given variable (compared by pointer).
+  // NOTE(review): every matching task is returned, not only the nearest
+  // parent; confirm whether that is intended.
+  vector checkTensorLiveness(const Variable* var) {
+    // Tasks where the tensor has been updated
+    vector parents;
+    for (auto& kv : task_map) {
+      for (auto& t : kv.second.updated_tensors) {
+        if (t == var) {
+          HCL_DEBUG_LEVEL(2) << "[ debug ] Tensor " << var->name_hint
+                             << " has been updated in task " << kv.second.name;
+          parents.push_back(kv.second.name);
+        }
+      }
+    }
+    return parents;
+  }
+
+  // Check the task graph inside the device scope
+  // (set only while the body of the "test" kernel is being visited)
+  bool device_scope_{false};
+  // False while the body of a top-level ProducerConsumer node is visited
+  bool top_level_producer_{true};
+  // Input tensor to the top level function
+  Array kernel_input_args;
+  // Map from task name to TaskNode
+  unordered_map task_map;
+};
+
+// 1. Locate which tensor (in which stage) will be layout transformed
+// 2. 
Locate its parent task and insert the layout mutation statements
+class LayoutTransformer : public IRMutator {  // Queues layout requests found in the IR, then applies them in Transform()
+ public:
+  explicit LayoutTransformer(unordered_map& task_map,
+                             Array& api_args,
+                             vector kernel_inputs)
+      : task_map_(task_map),
+        api_args_(api_args),
+        kernel_inputs_(kernel_inputs) {}
+  // NOTE(review): template arguments look stripped in this copy (e.g. "unordered_map>", ".as()") - restore before compiling
+  unordered_map& task_map_;  // held by reference; Transform() CHECKs that it contains each anchor producer name
+  Array& api_args_;  // held by reference; forwarded to CollectTypeShape
+  unordered_map> shape_;  // tensor name -> shape; presumably populated by CollectTypeShape - TODO confirm
+  unordered_map dtype_;  // tensor name -> element type; presumably populated by CollectTypeShape - TODO confirm
+  vector kernel_inputs_;  // stored by value; forwarded to UpdateBufferLayout / InsertReshapeBuffer
+
+  std::string current_producer;  // func_name() of the most recently visited ProducerConsumer
+  // Pending layout transforms keyed by tensor name (see worklist[name] below); values are TransformInfo
+  unordered_map worklist;
+  // Remember the name of the producer being visited; it is stored into every TransformInfo created below
+  Stmt Mutate_(const ProducerConsumer* op, const Stmt& s) {
+    current_producer = op->func->func_name();
+    return IRMutator::Mutate_(op, s);
+  }
+  // For tensor_layout_attrs: parse the ':'-separated target shape, queue a transpose or pack request, return only the mutated body
+  Stmt Mutate_(const AttrStmt* op, const Stmt& s) {
+    // The tensor to be transformed
+    if (op->attr_key == attr::tensor_layout_attrs) {
+      VarExpr var(op->node.node_);
+      auto name = var.get()->name_hint;
+      CHECK(shape_.count(name)) << name;
+      CHECK(dtype_.count(name)) << name;
+
+      size_t pos = 0;
+      string delimiter = ":";
+      string token;
+      Array target_shape;
+
+      CHECK(op->value.as());
+      string s(op->value.as()->value);  // shadows the Stmt parameter `s` inside this if-block
+
+      int target_total_width = 1;  // running product of the target dimensions
+      while ((pos = s.find(delimiter)) != string::npos) {  // consume "d0:d1:...:dn" one token at a time
+        token = s.substr(0, pos);
+        target_shape.push_back(std::stoi(token));
+        s.erase(0, pos + delimiter.length());
+        target_total_width *= std::stoi(token);
+      }
+      target_total_width *= std::stoi(s);  // whatever remains after the last delimiter is the final dimension
+      target_shape.push_back(std::stoi(s));
+
+      // Check transform type (transpose or packing) by comparing the products of the original and target dimensions
+      int origin_total_width = 1;
+      for (auto& dim : shape_.at(name)) {
+        CHECK(dim.as());
+        origin_total_width *= dim.as()->value;
+      }
+
+      // TODO(Hecmay): handle reshape (equal products are currently always treated as a transpose)
+      if (origin_total_width == target_total_width) {
+        HCL_DEBUG_LEVEL(2) << "[ debug ] Transpose layout of tensor " << name
+                           << "(" << shape_[name] << ") to (" << target_shape
+                           << ")";
+
+        CHECK(dtype_.count(name));
+        TransformInfo info = {name,
+                              var,
+                              current_producer,
+                              shape_[name],
+                              target_shape,
+                              dtype_[name],
+                              dtype_[name],
+                              true,  // presumably is_transpose - TODO confirm against the TransformInfo declaration
+                              false,  // presumably is_pack - TODO confirm
+                              1,  // presumably pack_factor - TODO confirm
+                              false};
+
+        if (!worklist.count(name)) {
+          worklist[name] = info;
+          // Otherwise an entry already exists for this tensor (presumably from an earlier pack request):
+          // divide the last target dimension by the recorded pack factor and mark the entry as transposed
+        } else {
+          Array new_shape;
+          int shape_size = info.target_shape.size();
+          for (int k = 0; k < shape_size; k++) {
+            if (k == shape_size - 1) {
+              int factor = worklist[name].pack_factor;
+              int new_dim = info.target_shape[k].as()->value;
+              new_dim /= factor;
+              new_shape.push_back(new_dim);
+            } else {
+              new_shape.push_back(info.target_shape[k]);
+            }
+          }
+          worklist[name].target_shape = new_shape;
+          worklist[name].is_transpose = true;
+        }
+
+      } else {
+        // Pack the last dimension by default
+        int pack_factor = origin_total_width / target_total_width;  // NOTE(review): integer division; no check for zero or a non-exact ratio
+        HCL_DEBUG_LEVEL(2) << "[ debug ] Pack layout of tensor " << name << "("
+                           << shape_[name] << ") to (" << op->value << ")";
+
+        Type new_type = Int(dtype_[name].bits() * pack_factor);  // packed element is pack_factor times as wide
+        TransformInfo info = {name, var, current_producer,
+                              shape_[name], target_shape, dtype_[name],
+                              new_type, false, true,
+                              pack_factor, false};
+
+        if (!worklist.count(name)) {
+          worklist[name] = info;
+          // Otherwise an entry already exists (presumably the target has been transposed):
+          // keep its transposed shape and apply data packing to the last dimension
+        } else {
+          Array new_shape;
+          int shape_size = worklist[name].target_shape.size();
+          for (int k = 0; k < shape_size; k++) {
+            if (k == shape_size - 1) {
+              int factor = pack_factor;
+              int new_dim = worklist[name].target_shape[k].as()->value;
+              new_dim /= factor;
+              new_shape.push_back(new_dim);
+            } else {
+              new_shape.push_back(worklist[name].target_shape[k]);
+            }
+          }
+          worklist[name].target_shape = new_shape;
+          worklist[name].type = Int(pack_factor * worklist[name].type.bits());
+          worklist[name].is_pack = true;
+          worklist[name].pack_factor = pack_factor;
+        }
+      }
+
+      return this->Mutate(op->body);  // the layout AttrStmt itself is not re-emitted
+    }
+    return IRMutator::Mutate_(op, s);
+  }
+  // Entry point: collect shapes/types, gather layout requests via Mutate(), then apply each queued request to the IR
+  Stmt Transform(Stmt s) {
+    CollectTypeShape(s, shape_, dtype_, api_args_);
+    Stmt stmt = this->Mutate(s);
+    // Process the worklist one by one
+    for (auto& kv : worklist) {
+      auto tensor_name = kv.first;
+      auto& info = kv.second;
+      auto producer_name = info.anchor_producer;
+      CHECK(shape_.count(tensor_name)) << tensor_name;
+
+      string status = "Processing tensor " + tensor_name + "(pack:";
+      status += info.is_pack ? std::to_string(info.pack_factor) : "no";
+      status += ", transpose:";
+      status += info.is_transpose ? "yes)" : "no)";
+
+      HCL_DEBUG_LEVEL(2) << "--------------";
+      HCL_DEBUG_LEVEL(2) << "[ INFO ] " << status << ". shape "
+                         << info.target_shape << ", type " << info.type;
+
+      VarExpr new_buf(tensor_name + ".new");  // NOTE(review): not referenced again in the visible code
+      HCL_DEBUG_LEVEL(2) << " [ debug ] transform layout of tensor "
+                         << tensor_name << " from stage " << producer_name;
+
+      // Mutate tensor access indices from all children stages
+      stmt = UpdateBufferLayout(stmt, info, task_map_, kernel_inputs_);
+      // Insert new buffer before anchor stage
+      CHECK(task_map_.count(producer_name));
+      stmt = InsertReshapeBuffer(stmt, info, task_map_, kernel_inputs_);
+    }
+    return stmt;
+  }
+};
+// Run TaskGraphBuilder over the statement, then apply LayoutTransformer to the result and return it
+Stmt TransformLayout(Stmt stmt, Array api_args) {
+  // Restore the task graph from the IR
+  HCL_DEBUG_LEVEL(2) << "------------ Transform Layout --------------";
+  TaskGraphBuilder tgb(api_args);
+  stmt = tgb.Mutate(stmt);
+
+  // Collect the kernel input names, then process the tensors in the worklist (to be transposed or packed)
+  vector kernel_inputs;
+  for (auto& arg : tgb.kernel_input_args) {
+    kernel_inputs.push_back(arg.get()->name_hint);
+  }
+  LayoutTransformer ltm(tgb.task_map, api_args, kernel_inputs);
+  stmt = ltm.Transform(stmt);
+  return stmt;
+}
+
+}  // namespace ir
+}  // namespace TVM
diff --git a/tvm/src/schedule/schedule_dataflow_rewrite.cc b/tvm/src/schedule/schedule_dataflow_rewrite.cc
index 57c06da55..dde48d605 100644
--- a/tvm/src/schedule/schedule_dataflow_rewrite.cc
+++ b/tvm/src/schedule/schedule_dataflow_rewrite.cc
@@ -175,6 +175,155 @@ class InfoUpdater final : public IRMutator {
   const bool is_sender_;
 };
 
+void Schedule::transform_layout(Stage parent, const Tensor& target,
+                                Array shape) {  // Wraps parent's extern-op body in a tensor_layout_attrs AttrStmt carrying "d0:d1:..."
+  // Locate the stage
+  Stage target_stage = (*this)[target];  // NOTE(review): target_stage is not used below
+  if (auto op = parent->op.as()) {  // nothing happens when this cast yields null
+    std::string shape_str = "";
+    std::string delim = "";
+    for (auto& dim : shape) {
+      CHECK(dim.as());
+      shape_str += delim + std::to_string(dim.as()->value);
+      delim = ":";  // empty before the first dimension, ':' afterwards
+    }
+    Stmt new_body =
+        AttrStmt::make(VarExpr(target->op->name), attr::tensor_layout_attrs,
+                       StringImm::make(shape_str), op->body);
+
+    parent->op = ExternOpNode::make(op->name, op->tag, op->axis, op->inputs,
+                                    op->input_placeholders,
+                                    op->output_placeholders, new_body);
+  }
+}
+
+// Create one extern-op stage per unrolled PE, attach them to the original (parent) stage, and return their output tensors
+Array Schedule::explicit_unroll(const Tensor& target,
+                                const Array axes,
+                                bool autosa) {
+  // Locate the stage
+  Stage target_stage = (*this)[target];
+  Array ret_tensors;
+
+  // The stage to be explicitly unrolled
+  Buffer target_buffer;
+  auto op = target_stage->op.as();
+  CHECK(op);
+  target_buffer = op->output_placeholders[0];  // NOTE(review): assigned but not read in the visible code
+  ArrayNode* stages = (*this)->stages.CopyOnWrite();
+  size_t pos = FindNodeRef(stages, target_stage);  // new PE stages are inserted at this index
+
+  // Unroll the loops explicitly
+  // 1. Create sub-stages and output buffers
+  // 2. Return new body for parent stage with attaching anchors
+  CHECK_GT(axes.size(), 0);
+
+  // Update the dataflow graph
+  // 1. The parent (original) stage has new inputs
+  // 2. The newly created stages output to parent, and takes parent inputs
+  auto parent_new_inputs = op->inputs;
+  auto parent_new_input_placeholders = op->input_placeholders;
+
+  // Assume the outer axis precedes the inner ones
+  // Create PE substage array (1D flattened)
+  std::unordered_map pe_row_number;  // row_index -> (extent - min) of the axis handled on that pass
+  std::vector stage_buffers;  // output buffers of the generated PE stages
+  std::string unrolled_axes = "";
+  std::string delim = "";
+
+  // TODO(hecmay): support more than 2 level
+  for (int level = axes.size() - 1; level >= 0; level--) {
+    auto& axis = axes[level];
+    auto min = axis->dom->min.as()->value;  // NOTE(review): cast result is dereferenced without a CHECK
+    auto extent = axis->dom->extent.as()->value;  // NOTE(review): same here; assumes a constant extent - TODO confirm
+    HCL_DEBUG_LEVEL(2) << "[ unrolling ] loop No. " << level << " range(" << min
+                       << "," << extent << ")";
+
+    int row_index = axes.size() - level - 1;  // 0 for the last axis in `axes`
+    pe_row_number[row_index] = extent - min;  // NOTE(review): treats extent as an exclusive upper bound; if extent is a length this only holds for min == 0 - confirm
+
+    // innermost loop unrolling
+    for (int k = min; k < extent; k++) {
+      int replicate_times = 1;
+      // replicate unrolled inner-level PEs
+      for (auto& kv : pe_row_number) {
+        if (kv.first < row_index) {
+          replicate_times *= kv.second;
+        }
+      }
+
+      for (int r = 0; r < replicate_times; r++) {
+        std::string new_name;  // stays empty when axes.size() > 2 (see the TODO above)
+        if (axes.size() == 1) {
+          new_name = target->op->name + "_pe_" + std::to_string(k);
+        } else if (axes.size() == 2) {
+          if (row_index == 0) break;  // with two axes, no stage is created on the row_index == 0 pass
+          new_name = target->op->name + "_pe_" + std::to_string(k) + "_" +
+                     std::to_string(r);
+        }
+
+        Array new_inputs = op->inputs;
+        Array new_input_placeholders = op->input_placeholders;
+        Array new_output_placeholders;
+
+        // Create op buffer node for new stage
+        Buffer new_output_buf =
+            BufferNode::make(Var(new_name, Handle()), Int(32), Array(),
+                             Array(), Expr(), new_name, "", 0, 0);
+        new_output_placeholders.push_back(new_output_buf);
+        stage_buffers.push_back(new_output_buf);
+
+        // Create new body for the PE
+        Stmt body = AttrStmt::make(
+            VarExpr(new_name), "kernel_scope", StringImm::make(new_name),
+            // Evaluate::make(1));
+            Evaluate::make(Call::make(Int(32), "pe", {}, Call::Intrinsic)));
+
+        // Create extern op node for the stage
+        auto new_op = ExternOpNode::make(new_name, "", Array(),
+                                         new_inputs, new_input_placeholders,
+                                         new_output_placeholders, body);
+        HCL_DEBUG_LEVEL(2) << "[ debug ] unrolling pe " << new_name
+                           << " body: " << body;
+
+        // Insert the output tensor
+        ret_tensors.push_back(new_op.output(0));
+        parent_new_inputs.push_back(new_op.output(0));
+        parent_new_input_placeholders.push_back(new_output_buf);
+
+        // Add stage into the DFG
+        Stage new_stage(new_op);
+        stages->data.insert(stages->data.begin() + pos, new_stage.node_);
+        (*this)->stage_map.Set(new_op, new_stage);
+      }
+    }
+    unrolled_axes += delim + axis->var.get()->name_hint;  // comma-separated axis names, in reverse order of `axes`
+    delim = ",";
+  }
+
+  // Use original op in case that the stage has been tiled
+  auto origin_op = target_stage->origin_op.as();
+  CHECK(origin_op);
+
+  // Update parent ops and body (set of attaching anchors)
+  Stmt new_body = origin_op->body;
+  Array annotate_keys = {StringImm::make("unroll")};
+  Array annotate_values = {StringImm::make(unrolled_axes)};
+  std::string key = (autosa) ? "autosa" : "systolic";
+  new_body = ExternModule::make(key, StringImm::make("HLS"), new_body,
+                                annotate_keys, annotate_values);
+
+  std::string parent_name = target->op->name;
+  for (auto& buffer : stage_buffers) {  // wrap the body in one "attach_scope" AttrStmt per generated PE buffer
+    new_body = AttrStmt::make(VarExpr(buffer.node_), "attach_scope",
+                              StringImm::make(parent_name), new_body);
+  }
+  target_stage->op = ExternOpNode::make(
+      op->name, op->tag, op->axis, parent_new_inputs,
+      parent_new_input_placeholders, op->output_placeholders, new_body);
+  return ret_tensors;
+}
+
 // Initialize static channel count
 int InfoUpdater::channelCount = 0;
 
diff --git a/tvm/src/schedule/schedule_lang.cc b/tvm/src/schedule/schedule_lang.cc
index f5d304dbc..da8b32e07 100644
--- a/tvm/src/schedule/schedule_lang.cc
+++ b/tvm/src/schedule/schedule_lang.cc
@@ -261,6 +261,21 @@ void CreateStencil(StageNode* stage, int burst_width, int unroll_factor,
       op->input_placeholders, op->output_placeholders, body);
 }
 
+void CreateSystolic(StageNode* stage) {
+  const ExternOpNode* op = stage->op.as();  // NOTE(review): not null-checked before op->body is read below
+  Array annotate_keys, annotate_values;  // both left empty
+  // Create an extern module to wrap the AutoSA generated HLS code
+  Stmt body = ExternModule::make("autosa", StringImm::make("HLS"), op->body,
+                                 annotate_keys, annotate_values);
+  stage->op = ExternOpNode::make(op->name,
+                                 op->tag,
+                                 op->axis,
+                                 op->inputs,
+                                 op->input_placeholders,
+                                 op->output_placeholders,
+                                 body);
+}
+
 void CreateDataflow(StageNode* stage, IterVar var) {
   const ExternOpNode* op = stage->op.as();
   Stmt body;
@@ -493,6 +508,11 @@ Stage& Stage::stencil(int burst_width, int unroll_factor, int num_iteration) {
   return *this;
 }
 
+Stage& Stage::systolic() {  // NOLINT(*)
+  CreateSystolic(operator->());  // replaces this stage's op with one whose body is wrapped in an "autosa" ExternModule
+  return *this;
+}
+
 Stage& Stage::dataflow(IterVar var) {
   CreateDataflow(operator->(), var);
   return *this;