diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 7f61b83c9..a00a306ea 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -26,5 +26,5 @@ jobs:
           source $VITIS/settings64.sh
           source /opt/xilinx/xrt/setup.sh
           export LOCAL_CI_TEST=1
-          which vivado_hls
-          python tests/test_cont_integration.py
+          export AUTOSA=/work/shared/common/AutoSA
+          python tests/test_continuous_integration.py
diff --git a/python/heterocl/autosa.py b/python/heterocl/autosa.py
new file mode 100644
index 000000000..86b492971
--- /dev/null
+++ b/python/heterocl/autosa.py
@@ -0,0 +1,292 @@
+import re
+import os
+import copy
+import sys
+import time
+from .util import run_process
+from .devices import Project, Platform
+
+# Module-wide state shared by every generated systolic-array (SA) module
+class SystolicArrayRegistry(object):
+    sa_module_cnt = 0  # running SA instance index; read by generate_systolic_array, incremented by add_prefix
+
+def count_SA_size(code):
+    """Print the 2-D SA size read from the last `PE_wrapper` reference in `code`; exit(1) if absent."""
+    pos = code.rfind("PE_wrapper")
+    # Presumably the first two integers are the last PE's (x, y) indices, hence the +1 below.
+    dims = re.findall(r" (\d+),", code[pos:pos+100])
+    if len(dims) < 2:
+        print("Failed to generate 2d SA. Size", dims)
+        sys.exit(1)  # non-zero status so callers/CI see the failure
+    dimX, dimY = int(dims[0])+1, int(dims[1])+1
+    print(f"[  INFO  ] generating SA dimnesion {dimX}x{dimY}.")
+
+def indent(num):
+    return " " * num  # a string of `num` spaces, used to align emitted code lines
+
+def get_function_code(name, code):  # text from the "inline void " before `name` up to the next "/* Helper" marker
+    pos = code.find(name)
+    if pos < 0: return ""  # `name` absent: nothing to extract (a -1 index would slice garbage)
+    end_pos = code.find("/* Helper", pos)
+    return code[max(pos - len("inline void "), 0):(end_pos if end_pos >= 0 else None)]  # no marker: run to the end
+
+
+def get_ser_size(code):
+    """Return the product of (N + 1) over every `<= N;` loop bound found in `code` (1 if none)."""
+    pattern = r"<= (\d+);"
+    size = 1
+    for line in code.split("\n"):
+        rets = re.findall(pattern, line)
+        if len(rets) > 0:
+            # At most one loop bound is expected per source line.
+            assert len(rets) == 1
+            size *= (int(rets[0])+1)
+    return size
+
+def insert_data_pack(ret_code, header, off_chip_data, written_data):
+    """Apply AutoSA's packed interface types to the call and header; returns (ret_code, header)."""
+    ret_code = ret_code.replace("buffer_", "").replace("[0]", "")
+    pattern = re.findall(r"autosa_func\((.*?)\)", ret_code)[0]
+    args = pattern.split(", ")
+    signature = re.findall(r"autosa_func\((.*?)\);", header)
+
+    # If the arg is accessed from off-chip memory, then we replace the typedef
+    # with target packed data type
+    types = signature[0].split(", ")
+    for t in types:
+        for arg in args:
+            if arg in t:
+                pattern = r"_t(\d+) "
+                target_type = re.findall(pattern, t)[0]
+                target_type_bits = int(target_type) * 32  # assumes 32-bit elements — TODO confirm
+                # Off-chip coalesced data access
+                if arg in off_chip_data:
+                    header = f"#undef {arg}_t\n#define {arg}_t ap_uint<{target_type_bits}>\n" + header
+
+                # Insert data packing and (de)serialization
+                # Create a new buffer and reshape it to original buffer after or before AutoSA func call
+                else:
+                    if arg in written_data:
+                        print(f"[ INFO ] Writing to on-chip memory {arg}. Packed into ap_uint<{target_type_bits}>...")
+                        # Allocate new buffer and perform data deserialization
+                        deser_func = f"host_deserialize_{arg}"
+                        # Check if the size matches
+                        code = get_function_code(deser_func, header)
+                        size = get_ser_size(code)
+                        ret_code = ret_code.replace(arg, f"{arg}_sa")
+                        ret_code = f"float {arg}_sa[{size}];\n" + indent(5) + ret_code + \
+                            indent(6) + f"{deser_func}({arg}, {arg}_sa);\n"
+                    # On-chip args that are only read get no extra buffer and no
+                    # deserialization call: nothing is emitted for them.
+
+    return ret_code, header
+
+# Rename AutoSA's fixed function names to inst<N>_<name>, where N is the running SA instance count
+def add_prefix(header, ret_code):
+    # Reserved function names in AutoSA generated code
+    function_list = [
+        "autosa_func", "PE_wrapper", "PE"
+    ]
+    index = SystolicArrayRegistry.sa_module_cnt
+    for f in function_list:
+        header = header.replace(f"{f}(", f" inst{index}_{f}(")  # plain substring match on "<name>("
+        ret_code = ret_code.replace(f"{f}(", f" inst{index}_{f}(")
+    SystolicArrayRegistry.sa_module_cnt += 1  # the next call uses the next instance index
+    return header, ret_code
+
+def infer_default_params(loop_bounds):  # -> (space_time, array_part, latency, simd, extra AutoSA flags)
+    assert len(loop_bounds) > 1, loop_bounds
+    extra_flags = "--simd-info=./autosa_tests/mm_hcl/simd_info.json "
+    # Params for MatMul (exactly three loop bounds: m, n, k)
+    if len(loop_bounds) == 3:
+        loop_bounds = [ int(_) for _ in loop_bounds ]
+        m, n, k = loop_bounds
+        if m > 1 and n > 1 and k > 1:
+            ST = 3
+            SA_dim_x = 4
+            SA_dim_y = 4
+            PART = f"{m},{n},{k}"
+            if m > 256 or n > 256 or k > 256: LAT = [16,16]
+            else: LAT = [ int(m/SA_dim_x), int(n/SA_dim_y) ]
+            LAT = [ str(1) if _ == 0 else str(_) for _ in LAT ]  # a factor of 0 is raised to 1
+            LAT = ",".join(LAT)
+            SIMD = k if k <= 8 else 4
+        # Map reduction loop to space dim (taken when any of m, n, k is <= 1)
+        else:
+            ST = 2
+            PART = "10,16"
+            LAT = "2,2"
+            SIMD = 4
+            extra_flags += "--local-reduce --reduce-op=\"+\" --simd-touch-space "
+
+    # Params for Conv: any length other than 3 must be exactly six bounds, or the unpack below raises ValueError
+    else:
+        OC, OH, OW, IC, R, C = loop_bounds
+        ST = 4
+        print(f"[  INFO  ] input size OC({OC}), OH({OH}), OW({OW}), IC({IC}), R({R}), C({C})")
+        PART = "16,13,13,1"
+        LAT  = "2,1,2"
+        SIMD = "1,1,2,4"
+        extra_flags = "--simd-info=./autosa_tests/cnn/simd_info.json "
+    return ST, PART, LAT, SIMD, extra_flags
+
+def generate_systolic_array(keys, values, code, backend):
+    """Run AutoSA on `code`; return [header, call]. A placeholder call is returned when $AUTOSA is unset."""
+    input_attr_info = dict()
+    packed_data = list()
+    transposed_data = list()
+
+    is_axis_enabled = False
+    loop_bounds = list()
+    off_chip_data = list()
+    written_data = list()
+
+    # Process attribute information for AutoSA module
+    for index in range(len(keys)):
+        key = keys[index].value
+        if key == "axis":
+            is_axis_enabled = True
+            continue
+        elif key == "loop_bound":
+            loop_bounds = values[index].value.split(",")
+        elif key == "tensor_placement":
+            info = values[index].value.split(",")
+            for var in info:
+                var_name = var.replace("[0]", "").replace("[1]", "")
+                var_name = var_name.replace("[read]", "").replace("[write]", "")
+                if "[0]" in var:
+                    off_chip_data.append(var_name)
+                if "[write]" in var:
+                    written_data.append(var_name)
+        else:
+            try:
+                is_transpose, pack_factor = values[index].value.split(",")
+                input_attr_info[key] = [int(is_transpose), int(pack_factor)]
+                if int(pack_factor) > 0:
+                    packed_data.append(key)
+                if int(is_transpose) == 1:
+                    transposed_data.append(key)
+            except (AttributeError, ValueError):
+                pass  # not an "<is_transpose>,<pack_factor>" attribute: ignore it
+
+    instance = SystolicArrayRegistry.sa_module_cnt
+    autosa_c_source = f"hcl_autosa_tmp_inst{instance}.c"
+    pwd = os.getcwd()
+    with open(autosa_c_source, "w") as fp:
+        fp.write("#include <stdio.h>\n")
+        fp.write("int main(int argc, char **argv) {\n")
+        fp.write(code)
+        fp.write("}")
+
+    header = "#include <autosa.h>\n"
+    ret_code = "autosa_func(args);\n"
+
+    # check autosa installation
+    autosa_dir = ""
+    try:
+        autosa_dir = os.environ["AUTOSA"]
+    except KeyError:
+        print("[{}] WARNING: AutoSA not found. Please setup env variable AUTOSA".format(time.strftime("%H:%M:%S", time.gmtime())))
+    if autosa_dir == "":
+        ret_code = "// Not found AutoSA. returns function placeholder\n" + indent(6) + ret_code
+        header = ""
+        return [header, ret_code]
+
+    source_path = os.path.join(pwd, autosa_c_source)
+    cmd = "cd {}; ".format(autosa_dir)
+    cmd += "./autosa "
+    cmd += "{} ".format(source_path)
+    cmd += "--config=./autosa_config/autosa_config.json "
+    if backend == "vhls":
+        cmd += "--target=autosa_hls_c "
+    elif backend == "aocl":
+        cmd += "--target=autosa_opencl "
+    else:
+        raise RuntimeError(f"Illegal backend {backend}")
+    cmd += "--output-dir=./autosa.tmp/output "
+
+    # Get the default value
+    ST, PART, LAT, SIMD, extra_flags = infer_default_params(loop_bounds)
+    # Internal debugging interface to set up the params
+    sa_space_time = os.getenv("SA_SPACE_TIME", ST)
+    sa_array_part = os.getenv("SA_ARRAY_PAR", PART)
+    sa_lat_hiding = os.getenv("SA_LAT_HIDING", LAT)
+    sa_simd = os.getenv("SA_SIMD", SIMD)
+
+    print(f"[  INFO  ] AutoSA params: Array partition {sa_array_part}. Latency hiding {sa_lat_hiding}. SIMD{sa_simd}")
+    cmd += "--sa-sizes=\"{{kernel[]->space_time[{}];".format(sa_space_time)
+    cmd += "kernel[]->array_part[{}];".format(sa_array_part)
+    cmd += "kernel[]->latency[{}];".format(sa_lat_hiding)
+    cmd += "kernel[]->simd[{}]".format(sa_simd)
+    cmd += "}\" "
+
+    cmd += "--hls "
+    cmd += "--hcl "
+    if is_axis_enabled:
+        pass # cmd += "--axi-stream "
+
+    # configure data packing
+    data_pack_config = ""  # must exist for every backend (not only vhls) before the check below
+    if backend == "vhls":
+        if len(packed_data) > 0:
+            data_pack_config = "--data-pack-sizes=\"{"
+            delim = ""
+            for var in packed_data:
+                data_pack_config += delim + "kernel[]->{}[8,32,64]".format(var)
+                delim = ";"
+            data_pack_config += "}\" "
+
+    if data_pack_config == "":
+        data_pack_config = "--no-data-pack "
+    cmd += data_pack_config
+    cmd += extra_flags
+
+    # additional flags for intel ocl
+    if backend == "aocl":
+        cmd += "--loop-infinitize --double-buffer-style=0 "
+
+    # Add serialization if the SA module has interface arguments
+    # cmd += "--host-serialize "
+    print(f"[  INFO  ] AutoSA command {cmd}")
+
+    # Save autosa command for debugging purposes
+    with open(f"hcl_autosa_cmd_inst{instance}.sh", "w") as fp:
+        fp.write(cmd)
+    run_process(cmd)
+
+    # Extract the autosa generated code
+    if backend == "vhls": autosa_header = f"hcl_autosa_tmp_inst{instance}_hcl_decl.h"
+    else: autosa_header = "hcl_autosa_tmp_kernel.h"
+
+    ext = "cpp" if backend == "vhls" else "cl"
+    source_file = f"{autosa_dir}/autosa.tmp/output/src/hcl_autosa_tmp_inst{instance}_kernel.{ext}"
+    with open(source_file, "r") as fp:
+        header = fp.read() + "\n"
+        header = header.replace(f"#include \"{autosa_header}\"", "")
+
+        if backend == "aocl":
+            # Also extract the helper functions for data serialization and deserialization
+            with open(f"{autosa_dir}/autosa.tmp/output/src/hcl_autosa_tmp_host.h", "r") as f:
+                content = f.read()
+                annotation = "/* Helper Function */"
+                start_pos = content.find(annotation)
+                end_pos = content.rfind(annotation) + len(annotation)
+                header += content[start_pos:end_pos] + "\n"
+
+        # For xilinx HLS backend
+        else:
+            count_SA_size(header)
+
+    # External module call inside top function
+    with open(f"{autosa_dir}/autosa.tmp/output/src/{autosa_header}", "r") as fp:
+        ret_code = fp.readlines()[0].strip() + ";\n"
+
+    # Add prefix to SA functions
+    header, ret_code = add_prefix(header, ret_code)
+
+    # Bitcasting the input arguments (to AutoSA selected bit-packing factor)
+    # 1. Substitute data type (interface arg) is decided by AutoSA (and possibly do some extra padding).
+    # 2. Substitute data serialization size and intrinsic
+    ret_code, header = insert_data_pack(ret_code, header, off_chip_data, written_data)
+
+    return [ header, ret_code ]
\ No newline at end of file
diff --git a/python/heterocl/schedule.py b/python/heterocl/schedule.py
index 2a2eaa031..a05d11c5f 100644
--- a/python/heterocl/schedule.py
+++ b/python/heterocl/schedule.py
@@ -265,6 +265,61 @@ def join(self, srcs, dest=None):
                         "inconsistent tensor joining"
             self.sch.join(target, dest, self[src])
 
+    def transpose(self, tensor=None):
+        """ Request a layout transform of `tensor` to its reversed shape; returns self for chaining """
+        if tensor is not None:
+            # `tensor` is either an explicit (stage, tensor) pair or a stage-bound tensor handle
+            if isinstance(tensor, tuple):
+                src, tensor = tensor
+                src = self.__getitem__(src)
+            else:
+                src = self.__getitem__(tensor)
+                tensor = tensor.tensor
+            try:
+                shape = [ int(_.value) for _ in tensor.shape ]
+            except Exception:  # dims without a usable `.value` (e.g. plain ints); don't trap KeyboardInterrupt
+                shape = [ int(_) for _ in tensor.shape ]
+
+            target_shape = shape[::-1]
+            self.cascade_tensor = tensor
+            self.cascade_source_stage = None
+            self.sch.transpose(src, tensor, target_shape)
+        return self
+
+    def pack(self, tensor=None, factor=512):
+        """ Pack `tensor` (or each tensor in a list) into `factor`-bit words for data transfer; returns self """
+        if isinstance(tensor, list):
+            for t in tensor:
+                self.pack(t, factor=factor)
+            return self
+
+        if tensor is not None:
+            if isinstance(tensor, tuple):
+                src, tensor = tensor
+                src = self.__getitem__(src)
+            else:
+                src = self.__getitem__(tensor)
+                tensor = tensor.tensor
+
+            try:
+                shape = [ int(_.value) for _ in tensor.shape ]
+            except Exception:  # dims without a usable `.value` (e.g. plain ints); don't trap KeyboardInterrupt
+                shape = [ int(_) for _ in tensor.shape ]
+            bits = types.get_bitwidth(tensor.dtype)
+            # Calculate target shape
+            new_shape = [1]
+            # Fold dims from the innermost outwards until more than `factor` bits are accumulated
+            for index in reversed(range(len(shape))):
+                bits *= shape[index]
+                if bits > factor:
+                    new_shape = shape[:index] + [ int(bits // factor) ]
+                    break
+
+            self.cascade_tensor = tensor
+            self.cascade_source_stage = None
+            self.sch.transpose(src, tensor, new_shape)
+
+        return self
 
     def to(self, tensor, dst=None, src=None, axis=0,
            mode=_expr.IO.DMA, fifo_depth=1, burst_len=-1):
diff --git a/python/heterocl/tvm/build_module.py b/python/heterocl/tvm/build_module.py
index 30d417ee8..e5ede8bfc 100755
--- a/python/heterocl/tvm/build_module.py
+++ b/python/heterocl/tvm/build_module.py
@@ -380,6 +380,9 @@ def lower(sch,
     stmt = ir_pass.AdjustBufferBinding(stmt, arg_list)
     stmt = ir_pass.InferStream(stmt, arg_list)
     stmt = ir_pass.AdjustBufferBinding(stmt, arg_list)
+    # perform layout transformation
+    stmt = ir_pass.TransformLayout(stmt, arg_list)
+    stmt = ir_pass.AdjustBufferBinding(stmt, arg_list)
     for f in lower_phase3:
         stmt = f(stmt)
     if simple_mode:
diff --git a/python/heterocl/tvm/runtime.py b/python/heterocl/tvm/runtime.py
index e6363f25d..9624afe1a 100644
--- a/python/heterocl/tvm/runtime.py
+++ b/python/heterocl/tvm/runtime.py
@@ -2,8 +2,8 @@
 import os, subprocess, time, re, glob
 from ..report import parse_xml
 from ..devices import Project
-
-debug = True
+from ..autosa import generate_systolic_array
+from ..util import run_process
 
 def find_path(path, fname):
     file_dir = []
@@ -30,16 +30,6 @@ def replace_text(f_name, prev, new):
     with open(f_name, 'w') as fp:
         fp.write(data)
 
-def run_process(cmd, pattern=None, env=None):
-    if debug: print("[DEBUG] Running commands: \n{}\n".format(cmd))
-    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
-    out, err = p.communicate()
-    if err: raise RuntimeError("Error raised: ", err.decode())
-    if pattern: return re.findall(pattern, out.decode("utf-8"))
-    if debug: 
-        print("[DEBUG] Commands outputs: \n{}\n".format(out.decode("utf-8")))
-    return out.decode("utf-8")
-
 @register_func
 def exec_init(dev_hash, tool, mode):
     # check whether pre-compiled bitstream exitsts
@@ -74,6 +64,19 @@ def exec_init(dev_hash, tool, mode):
 
 @register_func
 def process_extern_module(attr_key, annotate_keys, annotate_values, code):
+    if attr_key == "soda":
+        pos = code.find("#include")
+        code = code[pos:]
+        code = code.replace("extern \"C\" {", "")
+        code = code.replace("}  // extern \"C\"", "")
+        func_call = ""
+        return [code, func_call] 
+
+    # process the AutoSA input HLS code (string)
+    elif attr_key == "autosa":
+        backend = "vhls"
+        return generate_systolic_array(annotate_keys, annotate_values, code, backend)
+
     header, body = "", ""
     if attr_key == "vhls":
         kernel_name = ""
diff --git a/python/heterocl/tvm/schedule.py b/python/heterocl/tvm/schedule.py
index 396bbb7ab..ddb02a97d 100644
--- a/python/heterocl/tvm/schedule.py
+++ b/python/heterocl/tvm/schedule.py
@@ -9,6 +9,7 @@
 from . import expr as _expr
 from . import stmt as _stmt
 from . import container as _container
+import os
 
 @register_node
 class Buffer(NodeBase):
@@ -256,6 +257,9 @@ def reuse_at(self, target, parent, axis, name):
     def partition(self, target, partition_type, dim, factor):
         return _api_internal._SchedulePartition(self, target, dim, factor, partition_type)
 
+    def transpose(self, src, tensor, target_shape):
+        return _api_internal._TransformLayout(self, src, tensor, target_shape)  # FFI into Schedule::transform_layout
+
     # Create separate python functions for data movement FFIs
     # Move a stage's loop body to device
     def in_stage_move(self, target, dst, src, axis=0, 
@@ -542,7 +546,7 @@ def parallel(self, var):
         if isinstance(var, int):
             var = self.op.axis[var]
         _api_internal._StageParallel(self, var)
-    
+
     def dataflow(self, var=None):
         """Create dataflow region inside loop or function body
 
@@ -574,6 +578,11 @@ def pipeline(self, var, initiation_interval=1):
     def stencil(self, burst_width=512, unroll_factor=1, num_iteration=1):
         _api_internal._StageStencil(self, burst_width, unroll_factor, num_iteration)
 
+    def systolic(self, **kwargs):
+        """Mark this stage for systolic-array generation; kwargs are exported as env vars (e.g. SA_SIMD)."""
+        os.environ.update({key: str(value) for key, value in kwargs.items()})  # os.environ only accepts str values
+        _api_internal._StageSystolic(self)
+
     def pragma(self, var, pragma_type):
         """Annotate the iteration with pragma
 
diff --git a/python/heterocl/util.py b/python/heterocl/util.py
index 9ef069007..13eed9356 100644
--- a/python/heterocl/util.py
+++ b/python/heterocl/util.py
@@ -9,6 +9,7 @@
 from .scheme import Scheme
 from .debug import DTypeError
 from .mutator import Mutator
+import subprocess, re
 
 class VarName():
     """A counter for each type of variables.
@@ -141,3 +142,13 @@ def mutate_BinOp(self, binop, node):
 
     def mutate_Cast(self, node):
         return self.mutate(node.value)
+    
+def run_process(cmd, pattern=None, env=None, debug=True):  # run `cmd` in a shell; return stdout, or regex matches of `pattern` in it
+    if debug: print("[DEBUG] Running commands: \n{}\n".format(cmd))
+    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)  # NOTE(review): `env` is accepted but never passed on
+    out, err = p.communicate()  # stderr is not piped, so `err` is None here and the check below cannot fire
+    if err: raise RuntimeError("Error raised: ", err.decode())
+    if pattern: return re.findall(pattern, out.decode("utf-8"))
+    if debug: 
+        print("[DEBUG] Commands outputs: \n{}\n".format(out.decode("utf-8")))
+    return out.decode("utf-8")  # NOTE(review): the exit status of `cmd` is not inspected
diff --git a/samples/gemm/gemm_systolic.py b/samples/gemm/gemm_systolic.py
new file mode 100644
index 000000000..dddd1dc79
--- /dev/null
+++ b/samples/gemm/gemm_systolic.py
@@ -0,0 +1,57 @@
+import heterocl as hcl
+import numpy as np
+from itertools import permutations
+import os, sys
+import argparse
+
+def autosa_systolic_array(size):
+    m = size
+    n = size
+    k = size
+
+    dtype=hcl.Float()
+    hcl.init(dtype)
+
+    A = hcl.placeholder((m,k), dtype=dtype, name="A")
+    B = hcl.placeholder((k,n), dtype=dtype, name="B")
+
+    def kernel(A, B):
+        Y = hcl.compute((m, n), lambda *args: 0, dtype=dtype, name="Y0")
+        with hcl.Stage("Y"):
+            with hcl.for_(0, m, name="i") as i:
+                with hcl.for_(0, n, name="j") as j:
+                    Y[i][j] = 0
+                    with hcl.for_(0, k, name="k") as r:
+                        Y[i][j] += A[i][r] * B[r][j]
+        return Y
+
+    # Note that the AUTOSA environment variable must point to the
+    # AutoSA installation directory; otherwise the HCL runtime
+    # will only generate a function placeholder for the GEMM kernel
+    p = hcl.Platform.xilinx_zc706
+    p.config(compiler="vitis", mode="csyn")
+    
+    s = hcl.create_schedule([A, B], kernel)
+    MM = kernel.Y
+
+    s.to([A, B, kernel.Y0], p.xcel)
+    s.to(kernel.Y.Y0, p.host)
+
+    s[kernel.Y].systolic()  # map the GEMM stage "Y" onto a systolic array
+    s.transpose(kernel.Y.B)
+    s.pack([MM.B, MM.A, MM.Y0], factor=512)
+
+    np_A = np.random.randint(10, size=(m,k))
+    np_B = np.random.randint(10, size=(k,n))
+    np_C = np.zeros((m,n))
+    args = (np_A, np_B, np_C)
+
+    print(hcl.lower(s))
+    f = hcl.build(s, target=p)
+    f(hcl.asarray(np_A), hcl.asarray(np_B), hcl.asarray(np_C))
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--size', nargs='?', const=1024, type=int, default=1024)  # 1024 when omitted or given bare
+    args = parser.parse_args()
+    autosa_systolic_array(args.size)  # the same size is used for m, n and k
diff --git a/tests/issues/test_issue_416.py b/tests/issues/test_issue_416.py
index 7556f7bae..960d10a9e 100644
--- a/tests/issues/test_issue_416.py
+++ b/tests/issues/test_issue_416.py
@@ -61,7 +61,7 @@ def update_knn(dist, knn_mat, i, j):
 
     # Inputs/Outputs definition (§4)
     # Scalars (§4.1)
-    test_image = hcl.placeholder((), "test_image")
+    test_image = hcl.placeholder((1,), "test_image")
     # Tensors (§4.2)
     train_images = hcl.placeholder(data_size, "train_images")
 
@@ -106,8 +106,9 @@ def test_code_gen_knn():
     target = hcl.Platform.aws_f1
     target.config(compiler="vitis", backend="vhls", mode="debug")
     code = top(target)
-    assert "buffer_test_image = test_image" in code, code
-
+    for line in code:
+        if "buffer_test_image" in line:
+            assert "sizeof(ap_uint<49>)*1" in line, line
 
 if __name__ == "__main__":
     test_code_gen_knn()
diff --git a/tests/test_cont_integration.py b/tests/test_cont_integration.py
deleted file mode 100644
index 89616c3e0..000000000
--- a/tests/test_cont_integration.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import heterocl as hcl
-import os
-import numpy as np
-
-def test_vivado_hls():
-    def test_hls(target_mode):
-        hcl.init(hcl.Int(16))
-        A = hcl.placeholder((10,), "A")
-        def kernel(A):
-            B = hcl.compute(A.shape, lambda *args : A[args] + 1, "B")
-            return B
-        
-        target = hcl.Platform.aws_f1
-        s = hcl.create_schedule([A], kernel)
-        s.to(A, target.xcel)
-        s.to(kernel.B, target.host)
-        target.config(compiler="vivado_hls", mode=target_mode)
-        f = hcl.build(s, target)
-
-        np_A = np.random.randint(10, size=(10,))
-        np_B = np.zeros((10,))
-
-        hcl_A = hcl.asarray(np_A, dtype=hcl.Int(16))
-        hcl_B = hcl.asarray(np_B, dtype=hcl.Int(16))
-        f(hcl_A, hcl_B)
-        ret_B = hcl_B.asnumpy()
-
-        report = f.report()
-        np.testing.assert_array_equal(ret_B, (np_A+1)*1)
-    
-    if os.getenv("LOCAL_CI_TEST"):
-        test_hls("csim|csyn")
-    else:
-        assert os.getenv("LOCAL_CI_TEST") == None
-
-def test_vitis():
-    def test_hls(target_mode):
-        hcl.init(hcl.Int(16))
-        A = hcl.placeholder((10,), "A")
-        def kernel(A):
-            B = hcl.compute(A.shape, lambda *args : A[args] + 1, "B")
-            return B
-        
-        target = hcl.Platform.aws_f1
-        s = hcl.create_schedule([A], kernel)
-        s.to(A, target.xcel)
-        s.to(kernel.B, target.host)
-        target.config(compiler="vitis", mode=target_mode)
-        f = hcl.build(s, target)
-
-        np_A = np.random.randint(10, size=(10,))
-        np_B = np.zeros((10,))
-
-        hcl_A = hcl.asarray(np_A, dtype=hcl.Int(16))
-        hcl_B = hcl.asarray(np_B, dtype=hcl.Int(16))
-        f(hcl_A, hcl_B)
-        ret_B = hcl_B.asnumpy()
-        np.testing.assert_array_equal(ret_B, (np_A+1)*1)
-
-    if os.getenv("LOCAL_CI_TEST"):
-        test_hls("sw_sim")
-    else:
-        assert os.getenv("LOCAL_CI_TEST") == None
-
-if __name__ == "__main__":
-    test_vivado_hls()
-    test_vitis()
diff --git a/tests/test_continuous_integration.py b/tests/test_continuous_integration.py
new file mode 100644
index 000000000..024ba0990
--- /dev/null
+++ b/tests/test_continuous_integration.py
@@ -0,0 +1,127 @@
+import heterocl as hcl
+import os
+import numpy as np
+
+def test_vivado_hls():
+    def test_hls(target_mode):  # build B = A + 1 with the vivado_hls compiler and compare with numpy
+        hcl.init(hcl.Int(16))
+        A = hcl.placeholder((10,), "A")
+        def kernel(A):
+            B = hcl.compute(A.shape, lambda *args : A[args] + 1, "B")
+            return B
+
+        target = hcl.Platform.aws_f1
+        s = hcl.create_schedule([A], kernel)
+        s.to(A, target.xcel)
+        s.to(kernel.B, target.host)
+        target.config(compiler="vivado_hls", mode=target_mode)
+        f = hcl.build(s, target)
+
+        np_A = np.random.randint(10, size=(10,))
+        np_B = np.zeros((10,))
+
+        hcl_A = hcl.asarray(np_A, dtype=hcl.Int(16))
+        hcl_B = hcl.asarray(np_B, dtype=hcl.Int(16))
+        f(hcl_A, hcl_B)
+        ret_B = hcl_B.asnumpy()
+
+        report = f.report()  # result unused; the call only checks that report generation runs
+        np.testing.assert_array_equal(ret_B, (np_A+1)*1)
+
+    if os.getenv("LOCAL_CI_TEST"):  # tool-dependent part runs only on the local CI machine
+        test_hls("csim|csyn")
+    else:
+        assert os.getenv("LOCAL_CI_TEST") is None
+
+def test_vitis_sim():
+    def test_hls(target_mode):  # build B = A + 1 with the vitis compiler and compare with numpy
+        hcl.init(hcl.Int(16))
+        A = hcl.placeholder((10,), "A")
+        def kernel(A):
+            B = hcl.compute(A.shape, lambda *args : A[args] + 1, "B")
+            return B
+
+        target = hcl.Platform.aws_f1
+        s = hcl.create_schedule([A], kernel)
+        s.to(A, target.xcel)
+        s.to(kernel.B, target.host)
+        target.config(compiler="vitis", mode=target_mode)
+        f = hcl.build(s, target)
+
+        np_A = np.random.randint(10, size=(10,))
+        np_B = np.zeros((10,))
+
+        hcl_A = hcl.asarray(np_A, dtype=hcl.Int(16))
+        hcl_B = hcl.asarray(np_B, dtype=hcl.Int(16))
+        f(hcl_A, hcl_B)
+        ret_B = hcl_B.asnumpy()
+        np.testing.assert_array_equal(ret_B, (np_A+1)*1)
+
+    if os.getenv("LOCAL_CI_TEST"):  # tool-dependent part runs only on the local CI machine
+        test_hls("sw_sim")
+    else:
+        assert os.getenv("LOCAL_CI_TEST") is None
+
+def test_autosa_backend():
+    def test_hls(size, target_mode):  # generate a size^3 GEMM systolic array and count PE_wrapper occurrences
+        m = size
+        n = size
+        k = size
+
+        dtype=hcl.Float()
+        hcl.init(dtype)
+
+        A = hcl.placeholder((m,k), dtype=dtype, name="A")
+        B = hcl.placeholder((k,n), dtype=dtype, name="B")
+
+        def kernel(A, B):
+            Y = hcl.compute((m, n), lambda *args: 0, dtype=dtype, name="Y0")
+            with hcl.Stage("Y"):
+                with hcl.for_(0, m, name="i") as i:
+                    with hcl.for_(0, n, name="j") as j:
+                        Y[i][j] = 0
+                        with hcl.for_(0, k, name="k") as r:
+                            Y[i][j] += A[i][r] * B[r][j]
+            return Y
+
+        p = hcl.Platform.xilinx_zc706
+        p.config(compiler="vitis", mode=target_mode)
+
+        s = hcl.create_schedule([A, B], kernel)
+        MM = kernel.Y
+
+        s.to([A, B, kernel.Y0], p.xcel)
+        s.to(kernel.Y.Y0, p.host)
+
+        # intra-kernel data placement to create systolic array
+        s[kernel.Y].systolic()
+        # using .to() as alternative to .systolic() for SA generation
+        # PEs = s[kernel.Y].unroll(axis=[0,1], explicit=True)
+        # for r in range(64):
+        #     s.to(PEs[r,0].A, PEs[r,1]).to(PEs[r,2]).to(PEs[r,3])...
+        # for c in range(64):
+        #     s.to(PEs[0,c].B, PEs[1,c]).to(PEs[2,c]).to(PEs[3,c])...
+
+        s.transpose(kernel.Y.B)
+        s.pack([MM.B, MM.A, MM.Y0], factor=512)
+
+        np_A = np.random.randint(10, size=(m,k))
+        np_B = np.random.randint(10, size=(k,n))
+        np_C = np.zeros((m,n))
+        args = (np_A, np_B, np_C)
+
+        code = hcl.build(s, target=p)  # the result is treated as generated source text below
+        assert code.count("PE_wrapper") == 8196, \
+            "Wrong num of PEs in SA: {}".format(code.count("PE_wrapper"))
+
+    if os.getenv("LOCAL_CI_TEST"):
+        if os.getenv("AUTOSA"):  # silently skipped when AUTOSA is not configured
+            assert os.path.exists(os.getenv("AUTOSA"))
+            test_hls(1024, "debug")
+    else:
+        assert os.getenv("LOCAL_CI_TEST") is None
+
+if __name__ == "__main__":  # allow running the CI checks directly, without a test runner
+    test_vivado_hls()
+    test_vitis_sim()
+    test_autosa_backend()
diff --git a/tvm/HalideIR/src/ir/IR.cpp b/tvm/HalideIR/src/ir/IR.cpp
index 53f8ff89a..ae1e58081 100644
--- a/tvm/HalideIR/src/ir/IR.cpp
+++ b/tvm/HalideIR/src/ir/IR.cpp
@@ -1312,6 +1312,9 @@ Call::ConstString Call::memoize_expr = "memoize_expr";
 Call::ConstString Call::alloca = "alloca";
 Call::ConstString Call::likely = "likely";
 Call::ConstString Call::likely_if_innermost = "likely_if_innermost";
+Call::ConstString Call::transpose = "transpose";
+Call::ConstString Call::serialize = "serialize";
+Call::ConstString Call::deserialize = "deserialize";
 Call::ConstString Call::register_destructor = "register_destructor";
 Call::ConstString Call::div_round_to_zero = "div_round_to_zero";
 Call::ConstString Call::mod_round_to_zero = "mod_round_to_zero";
diff --git a/tvm/HalideIR/src/ir/IR.h b/tvm/HalideIR/src/ir/IR.h
index 3e4004743..2bfd90fb5 100644
--- a/tvm/HalideIR/src/ir/IR.h
+++ b/tvm/HalideIR/src/ir/IR.h
@@ -727,10 +727,11 @@ struct Call : public ExprNode<Call> {
       count_trailing_zeros, undef, return_second, if_then_else,
       glsl_texture_load, glsl_texture_store, glsl_varying, image_load,
       image_store, make_struct, stringify, memoize_expr, alloca, likely,
-      likely_if_innermost, register_destructor, div_round_to_zero,
-      mod_round_to_zero, call_cached_indirect_function, prefetch,
-      signed_integer_overflow, indeterminate_expression, bool_to_mask,
-      cast_mask, select_mask, extract_mask_element, size_of_halideir_buffer_t;
+      likely_if_innermost, transpose, serialize, deserialize,
+      register_destructor, div_round_to_zero, mod_round_to_zero,
+      call_cached_indirect_function, prefetch, signed_integer_overflow,
+      indeterminate_expression, bool_to_mask, cast_mask, select_mask,
+      extract_mask_element, size_of_halideir_buffer_t;
   // If it's a call to another halide function, this call node holds
   // onto a pointer to that function for the purposes of reference
   // counting only. Self-references in update definitions do not
diff --git a/tvm/include/tvm/ir.h b/tvm/include/tvm/ir.h
index 03fdb7d0a..4df14396b 100644
--- a/tvm/include/tvm/ir.h
+++ b/tvm/include/tvm/ir.h
@@ -252,6 +252,8 @@ constexpr const char* bind_scope = "bind_scope";
 
 constexpr const char* stream_scope = "stream_scope";
 constexpr const char* stream_attrs = "stream_attrs";
+// Define the desired tensor layout
+constexpr const char* tensor_layout_attrs = "tensor_layout_attrs";
 
 }  // namespace attr
 
diff --git a/tvm/include/tvm/ir_pass.h b/tvm/include/tvm/ir_pass.h
index 2c2cf700b..f9ceda763 100644
--- a/tvm/include/tvm/ir_pass.h
+++ b/tvm/include/tvm/ir_pass.h
@@ -212,6 +212,7 @@ Stmt RemoveNoOp(Stmt stmt);
  * \return Transformed stmt.
  */
 Stmt InferStream(Stmt stmt, Array<NodeRef> api_args);
+Stmt TransformLayout(Stmt stmt, Array<NodeRef> api_args);  // layout-transformation pass; same signature as InferStream
 
 /*!
  * \brief Split statement into pipeine stages.
diff --git a/tvm/include/tvm/schedule.h b/tvm/include/tvm/schedule.h
index df6db04e7..0b1728836 100644
--- a/tvm/include/tvm/schedule.h
+++ b/tvm/include/tvm/schedule.h
@@ -237,6 +237,7 @@ class Stage : public NodeRef {
 
   EXPORT Stage& stencil(int burst_width, int unroll_factor,
                         int num_iteration);  // NOLINT(*)
+  EXPORT Stage& systolic();   // NOLINT(*)                    
   /*!
    * \brief Annotate the iteration with pragma
    *
@@ -338,6 +339,12 @@ class Schedule : public NodeRef {
   EXPORT Tensor reuse_at(const Tensor& target, Stage parent, IterVar axis,
                          std::string name);
 
+  EXPORT Array<Tensor> explicit_unroll(
+    const Tensor& target, const Array<IterVar> axes, bool autosa);
+
+  EXPORT void transform_layout(
+    Stage parent, const Tensor& target, Array<Expr> shape);
+
   EXPORT void to_stage(const Tensor& target, Stage dest, int arg_pos,
                        ir::StreamType stream_type, int channel_depth,
                        std::string name);
diff --git a/tvm/src/api/api_lang.cc b/tvm/src/api/api_lang.cc
index c8c67fa59..9c637e7fe 100644
--- a/tvm/src/api/api_lang.cc
+++ b/tvm/src/api/api_lang.cc
@@ -301,6 +301,10 @@ TVM_REGISTER_API("_StageStencil").set_body([](TVMArgs args, TVMRetValue* ret) {
   args[0].operator Stage().stencil(args[1], args[2], args[3]);
 });
 
+TVM_REGISTER_API("_StageSystolic").set_body([](TVMArgs args, TVMRetValue* ret) {
+  args[0].operator Stage().systolic();  // only args[0] (the Stage) is read
+});
+
 TVM_REGISTER_API("_StagePragma").set_body([](TVMArgs args, TVMRetValue* ret) {
   args[0].operator Stage().pragma(args[1], args[2]);
 });
@@ -347,6 +351,12 @@ TVM_REGISTER_API("_SchedulePartition")
           static_cast<ir::PartitionType>(args[4].operator int()));
     });
 
+TVM_REGISTER_API("_TransformLayout")
+    .set_body([](TVMArgs args, TVMRetValue* ret) {
+      // args: schedule, parent stage, target tensor, new shape
+      args[0].operator Schedule().transform_layout(args[1], args[2], args[3]);
+    });
+
 TVM_REGISTER_API("_ScheduleMoveToStage")
     .set_body([](TVMArgs args, TVMRetValue* ret) {
       args[0].operator Schedule().to_stage(
diff --git a/tvm/src/api/api_pass.cc b/tvm/src/api/api_pass.cc
index f5a94990d..0b5177725 100644
--- a/tvm/src/api/api_pass.cc
+++ b/tvm/src/api/api_pass.cc
@@ -130,6 +130,7 @@ REGISTER_PASS2(InjectDoubleBuffer);
 REGISTER_PASS2(LoopPartition);
 REGISTER_PASS1(RemoveNoOp);
 REGISTER_PASS2(InferStream);
+REGISTER_PASS2(TransformLayout);
 REGISTER_PASS2(SplitPipeline);
 REGISTER_PASS2(LiftAttrScope);
 REGISTER_PASS1(NarrowChannelAccess);
diff --git a/tvm/src/codegen/build_util.cc b/tvm/src/codegen/build_util.cc
index 7fdd2fb88..99d8e399b 100644
--- a/tvm/src/codegen/build_util.cc
+++ b/tvm/src/codegen/build_util.cc
@@ -368,7 +368,7 @@ void PrintCopyBack(TVMArray* arr, std::vector<std::string> arg_names,
   }
 }
 
-// generate kernel code into files
+// Generate kernel code into files
 void GenKernelCode(std::string& test_file, std::vector<std::string> arg_names,
                    std::string platform, std::string backend,
                    std::string project) {
@@ -380,28 +380,27 @@ void GenKernelCode(std::string& test_file, std::vector<std::string> arg_names,
   if (platform == "aocl") kernel_ext = "cl";
   stream.open(project + "/kernel." + kernel_ext);
 
-  // generate hash
+  // Generate hash for source kernel file
   std::hash<std::string> hasher;
   stream << "// HASH:" << ((size_t)hasher(test_file) & 0xFFFFFFFF) << "\n";
 
-  // create typedef and header
+  // Create typedef and header
   if (platform == "vivado_hls" || platform == "sdsoc") {
-    // add header file to host code
+    // Add header file to host code
     auto pos = test_file.rfind("#include ");
     auto next = test_file.find('\n', pos);
     test_file.insert(next + 1, "#include \"kernel.h\"\n");
 
-    // create typedef list
+    // Create typedef list
     std::unordered_map<std::string, std::string> typedef_map(
         {{"ap_uint<32>", "ubit32"}, {"ap_int<32>", "bit32"}});
-
     for (auto& kv : typedef_map) {
       while (test_file.find(kv.first) != std::string::npos)
         test_file.replace(test_file.find(kv.first), kv.first.length(),
                           kv.second);
     }
 
-    // generate header file
+    // Generate header file
     std::ofstream header;
     header.open(project + "/kernel.h");
     header << "#ifndef __KERNEL_H__\n"
@@ -413,14 +412,14 @@ void GenKernelCode(std::string& test_file, std::vector<std::string> arg_names,
       header << "typedef " << kv.first << " " << kv.second << ";\n";
     }
 
-    // locate top function
+    // Locate top function
     CHECK(test_file.find("test(") != std::string::npos)
         << "cannot find top function";
     size_t dut = test_file.find("test(");
     size_t begin = test_file.rfind('\n', dut);
     size_t end = test_file.find(')', dut) + 1;
 
-    // TODO(hecmay): better way to specify prgamas
+    // TODO(hecmay): better way to specify pragmas
     if (platform == "sdsoc") {
       // TODO(hecmay): direct memory interface with PL and DDR
       header << "#pragma SDS data copy(";
diff --git a/tvm/src/codegen/codegen_c.cc b/tvm/src/codegen/codegen_c.cc
index 710a3da50..9257717f3 100644
--- a/tvm/src/codegen/codegen_c.cc
+++ b/tvm/src/codegen/codegen_c.cc
@@ -860,6 +860,12 @@ void CodeGenC::VisitExpr_(const Call* op, std::ostream& os) {  // NOLINT(*)
     os << "(";
     this->PrintExpr(op->args[0], os);
     os << " == NULL)";
+  } else if (op->is_intrinsic(Call::transpose)) {
+    LOG(WARNING) << "Intrinsic transpose not implemented yet";
+  } else if (op->is_intrinsic(Call::serialize)) {
+    LOG(WARNING) << "Intrinsic serialize not implemented yet";
+  } else if (op->is_intrinsic(Call::deserialize)) {
+    LOG(WARNING) << "Intrinsic deserialize not implemented yet";
   } else {
     if (op->call_type == Call::Intrinsic ||
         op->call_type == Call::PureIntrinsic) {
diff --git a/tvm/src/codegen/opencl/codegen_xocl_host.cc b/tvm/src/codegen/opencl/codegen_xocl_host.cc
index 65a84d1b7..4880c8648 100644
--- a/tvm/src/codegen/opencl/codegen_xocl_host.cc
+++ b/tvm/src/codegen/opencl/codegen_xocl_host.cc
@@ -215,6 +215,66 @@ void CodeGenXOCLHost::VisitStmt_(const Allocate* op) {
   this->PrintStmt(op->body);
 }
 
+void CodeGenXOCLHost::VisitExpr_(const Call* op,
+                                 std::ostream& os) {  // NOLINT(*)
+  // Lower the layout intrinsics for the host; other calls go to CodeGenC.
+  if (op->is_intrinsic(Call::transpose)) {
+    CHECK_EQ(op->args.size(), 3);
+    // NOTE(review): the helper below is appended to decl_stream on every
+    // transpose call, so a second call would define it twice - confirm.
+    decl_stream << "#include <algorithm>\n#include <vector>\n";
+    decl_stream << R"(
+template<class RandomIterator>
+void transpose(RandomIterator first, RandomIterator last, int m)
+{
+    const int mn1 = (last - first - 1);
+    const int n   = (last - first) / m;
+    std::vector<bool> visited(last - first);
+    RandomIterator cycle = first;
+    while (++cycle != last) {
+        if (visited[cycle - first])
+            continue;
+        int a = cycle - first;
+        do  {
+            a = a == mn1 ? mn1 : (n * a) % mn1;
+            std::swap(*(first + a), *cycle);
+            visited[a] = true;
+        } while ((first + a) != cycle);
+    }
+}
+)";
+    // Emits: transpose(<buf>.begin(), <buf>.end(), <args[2]>)
+    os << "transpose(";
+    this->PrintExpr(op->args[0], os);
+    os << ".begin(), ";
+    this->PrintExpr(op->args[0], os);
+    os << ".end(), " << op->args[2] << ")";
+  } else if (op->is_intrinsic(Call::serialize)) {
+    // Emits a call to a helper that is not generated here:
+    //    host_serialize_<name>(<name>_dev_ser.data(), <name>.data())
+    // args[1] (the element type string) is not needed for the call itself.
+    CHECK_EQ(op->args.size(), 2);
+    auto ptr = op->args[0].as<StringImm>();
+    CHECK(ptr) << "serialize expects a StringImm name: " << op->args[0];
+    auto name = ptr->value;
+    // The serialization buffer size depends on the access pattern and is
+    // decided by AutoSA, so <name>_dev_ser is only referenced here as a
+    // placeholder and its definition is left to code post-processing.
+    // NOTE(review): confirm the post-processing step declares that buffer.
+    os << "host_serialize_" << name << "(" << name << "_dev_ser.data(), "
+       << name << ".data())";
+  } else if (op->is_intrinsic(Call::deserialize)) {
+    CHECK_EQ(op->args.size(), 2);
+    auto ptr = op->args[0].as<StringImm>();
+    CHECK(ptr) << "deserialize expects a StringImm name: " << op->args[0];
+    auto name = ptr->value;
+    os << "host_deserialize_" << name << "(" << name << ".data(), " << name
+       << "_dev_deser.data())";
+  } else {
+    CodeGenC::VisitExpr_(op, os);
+  }
+}
+
 void CodeGenXOCLHost::VisitStmt_(const KernelStmt* op) {
   using TVM::ir::IoInfo;
   std::string name = op->name;
diff --git a/tvm/src/codegen/opencl/codegen_xocl_host.h b/tvm/src/codegen/opencl/codegen_xocl_host.h
index 00d855fe1..025f864e6 100644
--- a/tvm/src/codegen/opencl/codegen_xocl_host.h
+++ b/tvm/src/codegen/opencl/codegen_xocl_host.h
@@ -21,6 +21,7 @@ class CodeGenXOCLHost : public CodeGenC {
 
   void VisitExpr_(const Min* op, std::ostream& os) override;
   void VisitExpr_(const Max* op, std::ostream& os) override;
+  void VisitExpr_(const Call* op, std::ostream& os) override;
 
   void VisitStmt_(const For* op) override;
   void VisitStmt_(const IfThenElse* op) override;
diff --git a/tvm/src/pass/adjust_buffer_binding.cc b/tvm/src/pass/adjust_buffer_binding.cc
index 7b3992441..0b22ef64e 100644
--- a/tvm/src/pass/adjust_buffer_binding.cc
+++ b/tvm/src/pass/adjust_buffer_binding.cc
@@ -1,6 +1,6 @@
 /*!
  *  Copyright (c) 2019 by Contributors
- * \file adjust_buffer_binding.cc
+ * \file adjust_buffer_binding.cc
  */
 #include <arithmetic/Substitute.h>
 #include <tvm/ir.h>
@@ -33,6 +33,42 @@ class BufferBindingAdjuster final : public IRMutator {
     return IRMutator::Mutate_(op, s);
   }
 
+  Stmt Mutate_(const Stencil* op, const Stmt& s) {
+    // Inputs: a variable flagged by HandleUse must exist in name_var_map_.
+    Array<VarExpr> new_inputs;
+    for (auto& e : op->inputs) {
+      if (!HandleUse(e)) {
+        new_inputs.push_back(e);
+        continue;
+      }
+      HCL_DEBUG_LEVEL(2) << "Undefined Stencil input: " << e;
+      CHECK(e.as<Variable>());
+      auto name = e.as<Variable>()->name_hint;
+      CHECK(name_var_map_.count(name)) << name;
+      new_inputs.push_back(VarExpr(name_var_map_[name].node_));
+    }
+
+    // Outputs: a flagged variable without a map entry is kept unchanged.
+    Array<VarExpr> new_outputs;
+    for (auto& e : op->outputs) {
+      if (!HandleUse(e)) {
+        new_outputs.push_back(e);
+        continue;
+      }
+      HCL_DEBUG_LEVEL(2) << "Undefined Stencil output: " << e;
+      CHECK(e.as<Variable>());
+      auto name = e.as<Variable>()->name_hint;
+      if (name_var_map_.count(name)) {
+        new_outputs.push_back(VarExpr(name_var_map_[name].node_));
+      } else {
+        new_outputs.push_back(e);
+      }
+    }
+    Stmt new_body = this->Mutate(op->body);
+    return Stencil::make(new_inputs, new_outputs, new_body, op->burst_width,
+                         op->unroll_factor, op->num_iteration);
+  }
+
   Expr Mutate_(const Let* op, const Expr& e) {
     HandleDef(op->var);
     return this->Mutate(op->body);
@@ -117,15 +153,24 @@ class BufferBindingAdjuster final : public IRMutator {
     return IRMutator::Mutate_(op, s);
   }
 
-  Expr Mutate_(const Variable* op, const Expr& e) {
-    if (HandleUse(e)) {
-      HCL_DEBUG_LEVEL(2) << "Undefined Variable buffer: " << e;
-      auto buffer_name = op->name_hint;
-      CHECK(name_var_map_.count(buffer_name)) << buffer_name;
-      VarExpr new_buf(name_var_map_[buffer_name].node_);
-      return new_buf;
+  Expr Mutate_(const Call* op, const Expr& e) {
+    if (op->is_intrinsic(Call::transpose)) {
+      CHECK_EQ(op->args.size(), 3);
+      if (HandleUse(op->args[0])) {
+        auto var = op->args[0].as<Variable>();
+        CHECK(var);
+        HCL_DEBUG_LEVEL(2) << "Undefined instrinsic buffer: " << e;
+        auto buffer_name = var->name_hint;
+        CHECK(name_var_map_.count(buffer_name)) << buffer_name;
+        VarExpr new_buf(name_var_map_[buffer_name].node_);
+        return Call::make(Int(32), "transpose",
+                          {new_buf, op->args[1], op->args[2]}, Call::Intrinsic);
+      } else {
+        return IRMutator::Mutate_(op, e);
+      }
+    } else {
+      return IRMutator::Mutate_(op, e);
     }
-    return IRMutator::Mutate_(op, e);
   }
 
   Stmt Mutate_(const Partition* op, const Stmt& s) {
@@ -166,6 +211,13 @@ class BufferBindingAdjuster final : public IRMutator {
     return IRMutator::Mutate_(op, e);
   }
 
+  Expr Mutate_(const Variable* op, const Expr& e) {
+    if (HandleUse(e)) {  // log only; the variable itself is not rebound
+      HCL_DEBUG_LEVEL(2) << "Undefined Variable buffer: " << e;
+    }
+    return IRMutator::Mutate_(op, e);
+  }
+
   Expr Mutate_(const StreamExpr* op, const Expr& e) {
     if (HandleUse(op->buffer_var)) {
       HCL_DEBUG_LEVEL(2) << "Undefined StreamExpr buffer: " << e;
@@ -245,18 +297,13 @@ Stmt AdjustBufferBinding(Stmt stmt, Array<NodeRef> arg_list) {
       shape_map[node->data.get()] = node->shape;
       input_args.push_back(node->data);
       buffer_map[node->data.get()] = node->data;
-    } else {
-      const Variable* v = arg_list[i].as<Variable>();
-      CHECK(v) << "Illegal argument " << arg_list[i];
-      Var input_var(arg_list[i].node_);
-      shape_map[v] = {1};
-      input_args.push_back(input_var);
-      buffer_map[v] = input_var;
     }
   }
   Array<Var> undefined = UndefinedVars(stmt, input_args);
   if (undefined.size() > 0) {
     HCL_DEBUG_LEVEL(2) << "Fonud mismatching buffers in the stmt...";
+    HCL_DEBUG_LEVEL(2) << "----------------- stmt -----------------";
+    HCL_DEBUG_LEVEL(2) << stmt;
     for (auto& v : undefined) {
       HCL_DEBUG_LEVEL(2) << "    " << v << "(" << v.get() << ")";
     }
diff --git a/tvm/src/pass/stream_inference.cc b/tvm/src/pass/stream_inference.cc
index b8dd568a9..6b4e4a725 100644
--- a/tvm/src/pass/stream_inference.cc
+++ b/tvm/src/pass/stream_inference.cc
@@ -1834,8 +1834,23 @@ class FifoAccessChecker final : public IRMutator {
   std::map<const Variable*, Expr> min_map_;
 };
 
+// Records the simplified extent of each visited For loop (outer before inner)
+class CollectLoopNestBound final : public IRMutator {
+ public:
+  Stmt Mutate_(const For* op, const Stmt& s) {
+    bounds.push_back(Simplify(op->extent));
+    Stmt inner = this->Mutate(op->body);
+    return For::make(op->loop_var, op->min, op->extent, op->for_type,
+                     op->device_api, inner, op->annotate_keys,
+                     op->annotate_values);
+  }
+  vector<Expr> bounds;  // extents in visiting order
+};
+
 class ExternModuleFormater final : public IRMutator {
  public:
+  ExternModuleFormater(unordered_set<string> top_arg_names)
+      : top_arg_names_(top_arg_names) {}
   // Collect information of streamed module args
   Stmt Mutate_(const ExternModule* op, const Stmt& s) {
     if (collect_info) {
@@ -1880,6 +1895,59 @@ class ExternModuleFormater final : public IRMutator {
     } else {
       CHECK(port_types_map.count(op));
       CHECK(arg_names_map.count(op));
+
+      // Collect and inject loop information into ExternMod node
+      if (op->attr_key == "autosa") {
+        Expr value = this->Mutate(op->value);
+        Stmt body = this->Mutate(op->body);
+        auto annotate_keys = op->annotate_keys;
+        auto annotate_values = op->annotate_values;
+
+        // Collect loop bound information
+        CollectLoopNestBound collector;
+        collector.Mutate(op->body);
+        annotate_keys.push_back(StringImm::make("loop_bound"));
+        std::string bound_info;
+        std::string delim = "";
+        for (auto& e : collector.bounds) {
+          CHECK(e.as<IntImm>()) << e;
+          bound_info += delim + std::to_string(e.as<IntImm>()->value);
+          delim = ",";
+        }
+        annotate_values.push_back(StringImm::make(bound_info));
+
+        // Collect input tensor placement and read/write information
+        Array<Var> input_vars = UndefinedVars(body, Array<Var>());
+        Array<VarExpr> input_args;
+        for (auto& v : input_vars) {
+          input_args.push_back(v);
+        }
+        InputDirectionCollector idc(input_args);
+        auto is_arg_written = idc.Analyze(body);
+
+        annotate_keys.push_back(StringImm::make("tensor_placement"));
+        delim = "";
+        std::string placement_info;
+        for (auto& var : input_vars) {
+          string var_name = var.get()->name_hint;
+          placement_info += delim + var_name;
+          if (top_arg_names_.find(var_name) != top_arg_names_.end()) {
+            placement_info += "[0]";  // located on off-chip memory
+          } else {
+            placement_info += "[1]";  // loacted on on-chip memory
+          }
+          CHECK(is_arg_written.count(var_name)) << var_name;
+          if (is_arg_written.at(var_name)) {
+            placement_info += "[write]";
+          } else {
+            placement_info += "[read]";
+          }
+          delim = ",";
+        }
+        annotate_values.push_back(StringImm::make(placement_info));
+        return ExternModule::make(op->attr_key, value, body, annotate_keys,
+                                  annotate_values);
+      }
     }
 
     Stmt stmt = IRMutator::Mutate_(op, s);
@@ -1924,6 +1992,7 @@ class ExternModuleFormater final : public IRMutator {
     return Mutate(stmt);
   }
 
+  unordered_set<string> top_arg_names_;
   bool collect_info{false};
   unordered_map<const ExternModule*, vector<int>> port_types_map;
   unordered_map<const ExternModule*, vector<string>> arg_names_map;
@@ -2453,7 +2522,7 @@ Stmt InferStream(Stmt stmt, Array<NodeRef> api_args) {
 
   // Check the Extern Module
   // Convert streaming FIFOs into StreamAlloc
-  ExternModuleFormater emf;
+  ExternModuleFormater emf(sic.top_arg_names);
   stmt = emf.Format(stmt);
 
   // Handle self loopback streaming channels
diff --git a/tvm/src/pass/transform_layout.cc b/tvm/src/pass/transform_layout.cc
new file mode 100644
index 000000000..6f9bc800c
--- /dev/null
+++ b/tvm/src/pass/transform_layout.cc
@@ -0,0 +1,866 @@
+/*!
+ *  Copyright (c) 2021 by Contributors
+ *  Restore task graph and transform layout.
+ */
+// Transform the tensor layout based on annotation
+#include <tvm/ir.h>
+#include <tvm/ir_mutator.h>
+#include <tvm/ir_pass.h>
+#include <tvm/ir_visitor.h>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "../arithmetic/compute_expr.h"
+
+namespace TVM {
+namespace ir {
+
+using std::string;
+using std::unordered_map;
+using std::unordered_set;
+using std::vector;
+
+struct TaskNode {
+  string name;  // task name
+  // Tensors updated in the task
+  unordered_set<const Variable*> updated_tensors;
+  // Input tensors needed by the task
+  unordered_set<const Variable*> input_tensors;
+  // Names of the child tasks
+  unordered_set<string> children;
+  // Names of the parent tasks
+  unordered_set<string> parents;
+};
+
+struct TransformInfo {
+  string name;               // tensor name; matched against name_hint
+  VarExpr var;               // variable of the original buffer
+  string anchor_producer;    // producer stage used as insertion anchor
+  Array<Expr> origin_shape;  // shape before the transformation
+  Array<Expr> target_shape;  // shape of the transformed buffer
+  Type origin_type;          // element type before the transformation
+  Type type;                 // element type of the transformed buffer
+  bool is_transpose;         // accesses use reversed index order
+  bool is_pack;              // data packing requested
+  int pack_factor;           // NOTE(review): presumably elements per word
+  bool is_written;           // set once a Store to the tensor is visited
+};
+
+// Rewrites KernelStmt args by name_hint lookup in vmap (last match wins).
+class TensorSubstitution final : public IRMutator {
+ public:
+  TensorSubstitution(unordered_map<const Variable*, Expr>& vmap)
+      : vmap_(vmap) {}
+
+  Stmt Mutate_(const KernelStmt* op, const Stmt& s) final {
+    Array<Expr> new_args;
+    for (auto& e : op->args) {
+      auto ptr = e.as<Variable>();
+      CHECK(ptr) << e;
+      Expr new_buf;
+      bool is_found = false;
+      for (auto& kv : vmap_) {
+        if (kv.first->name_hint != ptr->name_hint) continue;
+        HCL_DEBUG_LEVEL(2) << "  -- [substitute] " << ptr->name_hint
+                           << " in kernel " << op->name;
+        new_buf = Expr(kv.second.node_);
+        is_found = true;
+      }
+      if (!is_found) {
+        new_args.push_back(e);
+        continue;
+      }
+      CHECK(new_buf.defined());
+      new_args.push_back(new_buf);
+    }
+    return KernelStmt::make(new_args, op->name, op->annotate_keys,
+                            op->annotate_values);
+  }
+  unordered_map<const Variable*, Expr>& vmap_;
+};
+
+Stmt SubstituteTensor(Stmt s, unordered_map<const Variable*, Expr> vmap) {
+  return TensorSubstitution(vmap).Mutate(s);  // vmap is a by-value copy
+}
+
+// Render a type as "<kind><bits>", e.g. "float32" for a 32-bit float. Codes
+// other than Float/UInt (including Int) are rendered with the "int" prefix.
+string Type2Str(Type type) {
+  const auto code = type.code();
+  const char* kind = "int";
+  if (code == Type::Float) {
+    kind = "float";
+  } else if (code == Type::UInt) {
+    kind = "uint";
+  }
+  return string(kind) + std::to_string(type.bits());
+}
+
+class TransformedBufferInserter final : public IRMutator {
+ public:
+  TransformedBufferInserter(std::string target_producer, TransformInfo& info)
+      : target_producer_(target_producer), info_(info) {}
+  // For the ProducerConsumer whose producer name equals target_producer_,
+  // rewrite its body to use a transformed buffer for info_.var.
+  Stmt Mutate_(const ProducerConsumer* op, const Stmt& s) {
+    if (op->is_producer) {
+      std::string name = op->func->func_name();
+      if (name == target_producer_) {
+        Stmt body = this->Mutate(op->body);
+        HCL_DEBUG_LEVEL(2) << "[ debug ] insert layout transformation before "
+                           << name;
+        VarExpr var(info_.name + ".new");
+        VarExpr old_var(info_.var.node_);
+        Type type = info_.type;
+        Array<Expr> origin_shape = info_.origin_shape;
+
+        std::string dtype;
+        if (info_.origin_type.code() == Type::Int) {
+          dtype = "int";
+        } else if (info_.origin_type.code() == Type::UInt) {
+          dtype = "uint";
+        } else if (info_.origin_type.code() == Type::Float) {
+          dtype = "float";
+        }
+
+        // For a pack-only tensor on the interface, the memory passed in is
+        // (per the original author) already stored contiguously, so the data
+        // counts as packed; only (de)serialization is inserted below.
+        if (info_.is_pack && !info_.is_transpose) {
+          // Insert serialization intrinsic for AutoSA
+          if (!info_.is_written) {
+            // Insert allocation dev_ser_tensor
+            VarExpr new_var(info_.name + ".dev.ser");
+            unordered_map<const Variable*, Expr> vmap;
+            vmap[info_.var.get()] = new_var;
+
+            // The serialized buffer may hold replicated data for some access
+            // patterns. NOTE(review): it is allocated with origin_shape here.
+            body = SubstituteTensor(body, vmap);
+            body = ProducerConsumer::make(op->func, op->is_producer, body);
+
+            Stmt serialize = Evaluate::make(Call::make(
+                Int(32), "serialize", {info_.name, dtype}, Call::Intrinsic));
+            body = Block::make(serialize, body);
+
+            body =
+                Allocate::make(new_var, info_.origin_type, info_.origin_shape,
+                               make_const(Bool(type.lanes()), true), body);
+            body = AttrStmt::make(new_var, attr::storage_scope,
+                                  StringImm::make("global"), body);
+            return body;
+
+          } else {
+            VarExpr new_var(info_.name + ".dev.deser");
+            unordered_map<const Variable*, Expr> vmap;
+            vmap[info_.var.get()] = new_var;
+            body = SubstituteTensor(body, vmap);
+            body = ProducerConsumer::make(op->func, op->is_producer, body);
+            // NOTE(review): body gets a second ProducerConsumer wrap below.
+            Stmt deserialize = Evaluate::make(Call::make(
+                Int(32), "deserialize", {info_.name, dtype}, Call::Intrinsic));
+            body = ProducerConsumer::make(op->func, op->is_producer, body);
+            body = Block::make(body, deserialize);
+
+            body =
+                Allocate::make(new_var, info_.origin_type, info_.origin_shape,
+                               make_const(Bool(type.lanes()), true), body);
+            body = AttrStmt::make(new_var, attr::storage_scope,
+                                  StringImm::make("global"), body);
+            return body;
+          }
+
+          // Insert an intrinsic to do in-place matrix transposition
+        } else if (info_.is_transpose) {
+          int size = 1;
+          for (auto& dim : origin_shape) {
+            auto ptr = dim.as<IntImm>();
+            CHECK(ptr);
+            size *= ptr->value;
+          }
+
+          VarExpr new_var(info_.name + ".dev.ser");
+          unordered_map<const Variable*, Expr> vmap;
+          vmap[info_.var.get()] = new_var;
+          body = SubstituteTensor(body, vmap);
+          body = ProducerConsumer::make(op->func, op->is_producer, body);
+
+          Stmt serialize = Evaluate::make(Call::make(
+              Int(32), "serialize", {info_.name, dtype}, Call::Intrinsic));
+          body = Block::make(serialize, body);
+
+          body = Allocate::make(new_var, info_.origin_type, info_.origin_shape,
+                                make_const(Bool(type.lanes()), true), body);
+          body = AttrStmt::make(new_var, attr::storage_scope,
+                                StringImm::make("global"), body);
+
+          // In-place matrix transposition
+          Stmt trans = Evaluate::make(
+              Call::make(Int(32), "transpose", {old_var, size, origin_shape[0]},
+                         Call::Intrinsic));
+          body = Block::make(trans, body);
+          return body;
+
+          // Insert reshaping logic explicitly
+        } else {
+          // Substitute buffer
+          unordered_map<const Variable*, Expr> vmap;
+          vmap[info_.var.get()] = var;
+          body = SubstituteTensor(body, vmap);
+          HCL_DEBUG_LEVEL(2) << "------------- Substitue ---------";
+          HCL_DEBUG_LEVEL(2) << "  from " << info_.var << " to " << var;
+          HCL_DEBUG_LEVEL(2) << "Inside body: " << body;
+        }
+
+        // Pack loop. NOTE(review): every is_pack path above returns - dead?
+        if (info_.is_pack) {
+          std::vector<Expr> indices, new_indices;
+          std::vector<VarExpr> loop_vars;
+          std::unordered_map<const Variable*, Expr> range_;
+          for (size_t i = 0; i < origin_shape.size(); i++) {
+            VarExpr iter(name + ".pack.r" + std::to_string(i));
+            indices.push_back(iter);
+            new_indices.push_back(iter);
+            loop_vars.push_back(iter);
+            range_[iter.get()] = Simplify(origin_shape[i] - 1);
+          }
+          // Dim for data packing
+          VarExpr iter(name + ".pack.r");
+          indices.push_back(iter);
+          loop_vars.push_back(iter);
+
+          // Expected output IR (example 512-packing)
+          // for i (0, 64)
+          //   for j (0, 4)
+          //     A.new[i,j] = 0
+          //     for p (0, 16)
+          //        A.new[i,j](32*p+32, 32*p) = A[i,j*16+p]
+          Array<Expr> pack_shape = info_.target_shape;
+          pack_shape.push_back(info_.pack_factor);
+          Expr pack_index = FlattenIndices(indices, pack_shape);
+          Expr new_index = FlattenIndices(new_indices, info_.target_shape);
+
+          // Pack + transpose
+          // Expected output IR (example 512-packing)
+          // for i (0, 64)
+          //   for j (0, 4)
+          //     A.new[i,j] = 0
+          //     for p (0, 16)
+          //        A.new[i,j](32*p+32, 32*p) = A[j*16+p,i]
+          if (info_.is_transpose) {
+            // Move last two iters to the front (i,(j,p)) to ((j,p),i). Left
+            // shifting
+            std::vector<Expr> transpose_indices = {indices[1], indices[2],
+                                                   indices[0]};
+            pack_index = FlattenIndices(transpose_indices, pack_shape);
+          }
+          Expr load =
+              Load::make(type, old_var, pack_index, UIntImm::make(UInt(1), 1));
+          Expr slice =
+              SetSlice::make(var, load, (1 + iter) * info_.pack_factor - 1,
+                             iter * info_.pack_factor);
+          Stmt for_stmt =
+              Store::make(var, slice, new_index, UIntImm::make(UInt(1), 1));
+
+          auto for_type = ForType::Serial;
+          int bound = pack_shape.size();
+          for (int j = bound - 1; j >= 0; j--) {
+            auto iter = loop_vars[j];
+            for_stmt = For::make(VarExpr(iter.node_), 0, pack_shape[j],
+                                 for_type, DeviceAPI::None, for_stmt);
+            // Insert initialization store
+            if (j == bound - 1) {
+              Stmt init =
+                  Store::make(var, 0, new_index, UIntImm::make(UInt(1), 1));
+              for_stmt = Block::make(init, for_stmt);
+            }
+          }
+          body = Block::make(for_stmt, body);
+
+          // Tensor transpose only
+        } else {
+          std::vector<Expr> indices;
+          std::vector<Expr> reverse_indices;
+          std::vector<VarExpr> loop_vars;
+          for (size_t i = 0; i < origin_shape.size(); i++) {
+            VarExpr iter(name + ".transpose.r" + std::to_string(i));
+            indices.push_back(iter);
+            reverse_indices.insert(reverse_indices.begin(), iter);
+            loop_vars.push_back(iter);
+          }
+          Expr reverse_index = FlattenIndices(reverse_indices, origin_shape);
+          Expr index = FlattenIndices(indices, origin_shape);
+          Expr load =
+              Load::make(type, old_var, index, UIntImm::make(UInt(1), 1));
+          Stmt for_stmt =
+              Store::make(var, load, reverse_index, UIntImm::make(UInt(1), 1));
+
+          auto for_type = ForType::Serial;
+          for (size_t j = 0; j < origin_shape.size(); j++) {
+            auto iter = loop_vars[j];
+            for_stmt = For::make(VarExpr(iter.node_), 0, origin_shape[j],
+                                 for_type, DeviceAPI::None, for_stmt);
+          }
+          body = Block::make(for_stmt, body);
+          HCL_DEBUG_LEVEL(2) << "[  debug  ] tranpose loop for " << var;
+          HCL_DEBUG_LEVEL(2) << for_stmt;
+        }
+
+        body = Allocate::make(var, type, info_.target_shape,
+                              make_const(Bool(type.lanes()), true), body);
+        body = AttrStmt::make(var, attr::storage_scope,
+                              StringImm::make("global"), body);
+        return ProducerConsumer::make(op->func, op->is_producer, body);
+      }
+    }
+    return IRMutator::Mutate_(op, s);
+  }
+  std::string target_producer_;
+  TransformInfo& info_;
+};
+
+class IndicesTransformer final : public IRMutator {
+ public:
+  IndicesTransformer(std::unordered_map<const Variable*, Expr>& range,
+                     std::vector<VarExpr>& loop_iter_vars, TransformInfo& info)
+      : range_(range), loop_iter_vars_(loop_iter_vars), info_(info) {}
+
+  // For AutoSA backend. Just inject the information without
+  // changing the IR
+  Stmt Mutate_(const ExternModule* op, const Stmt& s) {
+    has_autosa_module = true;
+    Expr value = this->Mutate(op->value);
+    Stmt body = this->Mutate(op->body);
+    auto annotate_keys = op->annotate_keys;
+    auto annotate_values = op->annotate_values;
+
+    annotate_keys.push_back(StringImm::make(info_.name));
+    string attr = info_.is_transpose ? "1" : "0";
+    attr += "," + std::to_string(info_.pack_factor);
+    annotate_values.push_back(StringImm::make(attr));
+
+    return ExternModule::make(op->attr_key, value, body, annotate_keys,
+                              annotate_values);
+  }
+
+  // Mutate the function argument
+  Stmt Mutate_(const KernelDef* op, const Stmt& s) override {
+    has_autosa_module = false;
+    Stmt body = this->Mutate(op->body);
+    Array<VarExpr> args;
+    Array<Array<Expr>> arg_shapes;
+    Array<Expr> arg_types;
+
+    for (size_t k = 0; k < op->args.size(); k++) {
+      auto name = op->args[k].get()->name_hint;
+      if (name == info_.name && !has_autosa_module) {
+        // Create arg with same node
+        VarExpr new_var(info_.name, info_.type);
+        args.push_back(new_var);
+        arg_shapes.push_back(info_.target_shape);
+        string type = Type2Str(info_.type);
+        arg_types.push_back(StringImm::make(type));
+      } else {
+        args.push_back(op->args[k]);
+        arg_shapes.push_back(op->arg_shapes[k]);
+        arg_types.push_back(op->arg_types[k]);
+      }
+    }
+    has_autosa_module = false;
+    return KernelDef::make(args, arg_shapes, arg_types, op->arg_tensors, body,
+                           op->ret_void, op->ret_type, op->name,
+                           op->attributes);
+  }
+
+  // Records every loop variable together with `extent - 1` (simplified), then
+  // lets the base mutator descend into the loop.
+  Stmt Mutate_(const For* op, const Stmt& s) override {
+    loop_iter_vars_.push_back(op->loop_var);
+    range_[op->loop_var.get()] = Simplify(op->extent - 1);
+    return IRMutator::Mutate_(op, s);
+  }
+
+  // Flag writes to the target tensor; when transposing, reverse its indices.
+  Stmt Mutate_(const Store* op, const Stmt& s) {
+    if (info_.name != op->buffer_var.get()->name_hint)
+      return IRMutator::Mutate_(op, s);
+    info_.is_written = true;
+    if (!info_.is_transpose) return IRMutator::Mutate_(op, s);
+    Array<Expr> shape = info_.origin_shape;
+    auto indices = ExtractIndices(op->index, shape, range_);
+    std::reverse(indices.begin(), indices.end());
+    auto new_index = FlattenIndices(indices, shape);
+    // Keep recursing into the value: it may load from the transposed tensor.
+    Expr value = this->Mutate(op->value);
+    return Store::make(op->buffer_var, value, new_index, op->predicate);
+  }
+
+  // Loads from the target tensor get their indices reversed when a transpose
+  // is requested; every other load is handed to the base mutator.
+  Expr Mutate_(const Load* op, const Expr& e) {
+    const bool hits_target = info_.name == op->buffer_var.get()->name_hint;
+    if (!(info_.is_transpose && hits_target)) {
+      return IRMutator::Mutate_(op, e);
+    }
+    // ExtractIndices/FlattenIndices live outside this class (not shown here).
+    Array<Expr> dims = info_.origin_shape;
+    auto per_dim = ExtractIndices(op->index, dims, range_);
+    std::reverse(per_dim.begin(), per_dim.end());
+    auto flipped = FlattenIndices(per_dim, dims);
+    return Load::make(op->type, op->buffer_var, flipped, op->predicate);
+  }
+
+  std::unordered_map<const Variable*, Expr>& range_;
+  std::vector<VarExpr>& loop_iter_vars_;
+  TransformInfo& info_;
+  bool has_autosa_module{false};
+};
+
+// Runs TransformedBufferInserter for `info` when the tensor is one of the
+// top-level kernel arguments; for any other tensor `s` is returned as is.
+//   s                  - statement to rewrite
+//   info               - layout transformation request for one tensor
+//   task_map_          - task graph; must contain info.anchor_producer
+//   kernel_input_names - names of the top-level kernel arguments
+Stmt InsertReshapeBuffer(Stmt s, TransformInfo& info,
+                         unordered_map<string, TaskNode>& task_map_,
+                         vector<string> kernel_input_names) {
+  const string& tensor_name = info.name;
+  CHECK(task_map_.count(info.anchor_producer));
+
+  const bool is_top_arg =
+      std::find(kernel_input_names.begin(), kernel_input_names.end(),
+                tensor_name) != kernel_input_names.end();
+
+  // TODO(hecmay): handles on-chip data packing as well
+  if (!is_top_arg) {
+    return s;
+  }
+
+  HCL_DEBUG_LEVEL(2) << "    [ debug ] tensor " << tensor_name
+                     << " is on top function interface";
+  // "test" is the kernel name TaskGraphBuilder also treats as device scope.
+  string target_producer = "test";
+  TransformedBufferInserter tbi(target_producer, info);
+  return tbi.Mutate(s);
+}
+
+// Rewrites accesses to the tensor `info.name` for the requested layout.
+// Only tensors on the top-level kernel interface are handled: an
+// IndicesTransformer pass reverses their indices when a transpose is
+// requested and patches the kernel signature. For any other tensor the
+// statement is returned unchanged.
+//   s                  - statement to rewrite
+//   info               - layout request; the pass may set info.is_written
+//   task_map_          - task graph; must contain info.anchor_producer
+//   kernel_input_names - names of the top-level kernel arguments
+// Returns the rewritten statement.
+Stmt UpdateBufferLayout(Stmt s, TransformInfo& info,
+                        unordered_map<string, TaskNode>& task_map_,
+                        vector<string> kernel_input_names) {
+  const string& tensor_name = info.name;
+  CHECK(task_map_.count(info.anchor_producer));
+
+  const bool is_top_arg =
+      std::find(kernel_input_names.begin(), kernel_input_names.end(),
+                tensor_name) != kernel_input_names.end();
+  if (!is_top_arg) {
+    // Tensors that are not kernel arguments are not transformed here.
+    return s;
+  }
+
+  // Update buffer access indices and kernel
+  // function signature as well
+  // (range_ / loop_iter_vars_ are scratch state filled during the walk).
+  std::unordered_map<const Variable*, Expr> range_;
+  std::vector<VarExpr> loop_iter_vars_;
+  IndicesTransformer ivc(range_, loop_iter_vars_, info);
+  return ivc.Mutate(s);
+}
+
+// Collects, by name, the shape and dtype of API buffers, Allocate nodes and
+// KernelDef arguments (plus the names of all top-level API arguments).
+class TypeShapeCollector final : public IRMutator {
+ public:
+  TypeShapeCollector(Array<NodeRef>& api_args) {
+    for (const auto& arg : api_args) {
+      if (const Variable* scalar = arg.as<Variable>()) {
+        top_arg_names.insert(scalar->name_hint);
+        continue;
+      }
+      const auto* buffer = arg.as<BufferNode>();
+      if (buffer == nullptr) continue;
+      CHECK(buffer->data.as<Variable>());
+      const auto& key = buffer->data.get()->name_hint;
+      top_arg_names.insert(buffer->name);
+      shape_[key] = buffer->shape;
+      dtype_[key] = buffer->dtype;
+      HCL_DEBUG_LEVEL(2) << "  [ collect shape ] " << buffer->name;
+    }
+  }
+
+  // Locally allocated buffers contribute their extents and element type.
+  Stmt Mutate_(const Allocate* op, const Stmt& s) final {
+    const auto& buf_name = op->buffer_var.get()->name_hint;
+    shape_[buf_name] = op->extents;
+    dtype_[buf_name] = op->type;
+    HCL_DEBUG_LEVEL(2) << "  [ collect shape ] " << buf_name;
+    return IRMutator::Mutate_(op, s);
+  }
+
+  // Kernel arguments carry their shape directly and their type as a string.
+  Stmt Mutate_(const KernelDef* op, const Stmt& s) {
+    for (size_t idx = 0; idx < op->args.size(); ++idx) {
+      const string arg_name = op->args[idx].get()->name_hint;
+      const auto* type_str = op->arg_types[idx].as<StringImm>();
+      shape_[arg_name] = op->arg_shapes[idx];
+      CHECK(type_str);
+      dtype_[arg_name] = Str2Type(type_str->value);
+      HCL_DEBUG_LEVEL(2) << "  [ collect shape ] " << arg_name;
+    }
+    return IRMutator::Mutate_(op, s);
+  }
+
+  // Parses "int<N>", "uint<N>" or "float<N>" (N read with std::atoi right
+  // after the prefix); any other spelling yields Int(32).
+  Type Str2Type(string type_str) {
+    auto has_prefix = [&type_str](const char* prefix) {
+      return type_str.find(prefix) == 0;
+    };
+    auto width_after = [&type_str](size_t prefix_len) {
+      return std::atoi(type_str.c_str() + prefix_len);
+    };
+    if (has_prefix("int")) return Int(width_after(3));
+    if (has_prefix("uint")) return UInt(width_after(4));
+    if (has_prefix("float")) return Float(width_after(5));
+    return Int(32);
+  }
+  unordered_set<string> top_arg_names;
+  unordered_map<string, Array<Expr>> shape_;
+  unordered_map<string, Type> dtype_;
+};
+
+void CollectTypeShape(Stmt body, unordered_map<string, Array<Expr>>& shape,
+                      unordered_map<string, Type>& dtype,
+                      Array<NodeRef>& api_args) {
+  HCL_DEBUG_LEVEL(2) << "---------- collect shape/dtype ---------";
+  TypeShapeCollector collector(api_args);  // seeds tables from the API args
+  collector.Mutate(body);                  // mutated result is not used
+  shape = collector.shape_;
+  dtype = collector.dtype_;
+}
+
+// Walks a statement and classifies the buffers it touches by access kind.
+// A buffer introduced by an Allocate already visited counts as local and is
+// skipped; any other buffer is recorded as updated (Store target) and/or as
+// input (Load source).
+class BufferStatusCollector : public ir::IRMutator {
+ public:
+  // Records the destination of a store unless it is a local buffer.
+  Stmt Mutate_(const Store* op, const Stmt& s) {
+    // Children first, then classify the (re-fetched) store node.
+    Stmt stmt = IRMutator::Mutate_(op, s);
+    const Variable* buffer = stmt.as<Store>()->buffer_var.get();
+    if (!local_buffers.count(buffer)) updated_tensors.insert(buffer);
+    return stmt;
+  }
+
+  // Records the source of a load unless it is a local buffer.
+  Expr Mutate_(const Load* op, const Expr& e) {
+    const Variable* buffer = op->buffer_var.get();
+    if (!local_buffers.count(buffer)) input_tensors.insert(buffer);
+    return IRMutator::Mutate_(op, e);
+  }
+
+  // Registers the allocated buffer as local before visiting the body.
+  Stmt Mutate_(const Allocate* op, const Stmt& s) {
+    local_buffers.insert(op->buffer_var.get());
+    return IRMutator::Mutate_(op, s);
+  }
+
+  unordered_set<const Variable*> local_buffers;    // declared via Allocate
+  unordered_set<const Variable*> updated_tensors;  // non-local store targets
+  unordered_set<const Variable*> input_tensors;    // non-local load sources
+};
+
+// Each task in the graph represents the logic performed by
+// a HCL stage. The task graph is a coarse grained DFG.
+// There is no control flow branching across different tasks
+class TaskGraphBuilder : public IRMutator {
+ public:
+  explicit TaskGraphBuilder(Array<NodeRef> api_args) {}  // api_args unused
+  // Only the kernel named "test" is analysed; other kernels are just walked.
+  Stmt Mutate_(const KernelDef* op, const Stmt& s) {
+    if (op->name == "test") {
+      device_scope_ = true;
+
+      // Save the input tensors
+      for (auto& v : op->args) {
+        kernel_input_args.push_back(v);
+      }
+      Stmt body = this->Mutate(op->body);
+      device_scope_ = false;
+      return KernelDef::make(op->args, op->arg_shapes, op->arg_types,
+                             op->arg_tensors, body, op->ret_void, op->ret_type,
+                             op->name, op->attributes);
+    } else {
+      return IRMutator::Mutate_(op, s);
+    }
+  }
+  // Creates one TaskNode per outermost producer stage in the device kernel.
+  Stmt Mutate_(const ProducerConsumer* op, const Stmt& s) {
+    if (top_level_producer_ && device_scope_) {
+      top_level_producer_ = false;  // nested stages take the fallback path
+      Stmt body = this->Mutate(op->body);
+      if (op->is_producer) {
+        std::string name = op->func->func_name();
+
+        // Create a task node in the graph
+        BufferStatusCollector bsc;
+        bsc.Mutate(op->body);
+        TaskNode task = {name, bsc.updated_tensors, bsc.input_tensors, {}, {}};
+
+        // Checking depending input tensors
+        Array<Var> kernel_input_vars;
+        for (auto& input : kernel_input_args) {
+          Var v(input.node_);
+          kernel_input_vars.push_back(v);
+        }
+        Array<Var> undefs = UndefinedVars(body, kernel_input_vars);
+        // The task can be a producer of a tensor, or it will just update
+        // a set of tensors. If the input tensor is not defined in this task
+        // nor in the input arguments, then it must have been defined in the
+        // previous tasks visited in the traversal
+        for (auto& var : undefs) {
+          auto parents = checkTensorLiveness(var.get());
+          for (auto& parent_task_name : parents) {
+            task.parents.insert(parent_task_name);
+            CHECK(task_map.count(parent_task_name));
+            task_map[parent_task_name].children.insert(name);
+          }
+        }
+        task_map[name] = task;
+        HCL_DEBUG_LEVEL(2) << "[ debug ] producing tensor " << name;
+      }
+      top_level_producer_ = true;
+      return ProducerConsumer::make(op->func, op->is_producer, body);
+    }
+    return IRMutator::Mutate_(op, s);
+  }
+
+  // Return the names of all recorded tasks that updated the given tensor
+  vector<string> checkTensorLiveness(const Variable* var) {
+    // Tasks where the tensor has been updated
+    vector<string> parents;
+    for (auto& kv : task_map) {
+      for (auto& t : kv.second.updated_tensors) {
+        if (t == var) {
+          HCL_DEBUG_LEVEL(2) << "[ debug ] Tensor " << var->name_hint
+                             << " has been updated in task " << kv.second.name;
+          parents.push_back(kv.second.name);
+        }
+      }
+    }
+    return parents;
+  }
+
+  // Check the task graph inside the device scope
+  bool device_scope_{false};
+  bool top_level_producer_{true};
+  // Input tensor to the top level function
+  Array<VarExpr> kernel_input_args;
+  // Map from task name to TaskNode
+  unordered_map<string, TaskNode> task_map;
+};
+
+// 1. Locate which tensor (in which stage) will be layout transformed
+// 2. Locate its parent task and insert the layout mutation statements
+class LayoutTransformer : public IRMutator {
+ public:
+  explicit LayoutTransformer(unordered_map<string, TaskNode>& task_map,
+                             Array<NodeRef>& api_args,
+                             vector<string> kernel_inputs)
+      : task_map_(task_map),
+        api_args_(api_args),
+        kernel_inputs_(kernel_inputs) {}
+
+  unordered_map<string, TaskNode>& task_map_;
+  Array<NodeRef>& api_args_;
+  unordered_map<string, Array<Expr>> shape_;  // filled by CollectTypeShape
+  unordered_map<string, Type> dtype_;         // filled by CollectTypeShape
+  vector<string> kernel_inputs_;
+
+  std::string current_producer;  // func name of the last visited stage
+  // Map from target tensor name to its pending TransformInfo
+  unordered_map<string, TransformInfo> worklist;
+  // Remember the enclosing stage: it becomes the anchor of new requests.
+  Stmt Mutate_(const ProducerConsumer* op, const Stmt& s) {
+    current_producer = op->func->func_name();
+    return IRMutator::Mutate_(op, s);
+  }
+  // Parses a tensor_layout_attrs request ("d0:d1:...") into the worklist.
+  Stmt Mutate_(const AttrStmt* op, const Stmt& s) {
+    // The tensor to be transformed
+    if (op->attr_key == attr::tensor_layout_attrs) {
+      VarExpr var(op->node.node_);
+      auto name = var.get()->name_hint;
+      CHECK(shape_.count(name)) << name;
+      CHECK(dtype_.count(name)) << name;
+
+      size_t pos = 0;
+      string delimiter = ":";
+      string token;
+      Array<Expr> target_shape;
+
+      CHECK(op->value.as<StringImm>());
+      string s(op->value.as<StringImm>()->value);  // shadows param s
+      // Split on ':'; target_total_width is the product of all dimensions.
+      int target_total_width = 1;
+      while ((pos = s.find(delimiter)) != string::npos) {
+        token = s.substr(0, pos);
+        target_shape.push_back(std::stoi(token));
+        s.erase(0, pos + delimiter.length());
+        target_total_width *= std::stoi(token);
+      }
+      target_total_width *= std::stoi(s);
+      target_shape.push_back(std::stoi(s));
+
+      // Check transform type (transpose or packing)
+      int origin_total_width = 1;
+      for (auto& dim : shape_.at(name)) {
+        CHECK(dim.as<IntImm>());
+        origin_total_width *= dim.as<IntImm>()->value;
+      }
+
+      // TODO(Hecmay): handle reshape
+      if (origin_total_width == target_total_width) {
+        HCL_DEBUG_LEVEL(2) << "[ debug ] Transpose layout of tensor " << name
+                           << "(" << shape_[name] << ") to (" << target_shape
+                           << ")";
+
+        CHECK(dtype_.count(name));
+        TransformInfo info = {name,
+                              var,
+                              current_producer,
+                              shape_[name],
+                              target_shape,
+                              dtype_[name],
+                              dtype_[name],
+                              true,
+                              false,
+                              1,
+                              false};
+
+        if (!worklist.count(name)) {
+          worklist[name] = info;
+          // The tensor has been packed
+          // Recalculate the packing shape
+        } else {
+          Array<Expr> new_shape;
+          int shape_size = info.target_shape.size();
+          for (int k = 0; k < shape_size; k++) {
+            if (k == shape_size - 1) {
+              int factor = worklist[name].pack_factor;
+              int new_dim = info.target_shape[k].as<IntImm>()->value;
+              new_dim /= factor;
+              new_shape.push_back(new_dim);
+            } else {
+              new_shape.push_back(info.target_shape[k]);
+            }
+          }
+          worklist[name].target_shape = new_shape;
+          worklist[name].is_transpose = true;
+        }
+
+      } else {
+        // Pack the last dimension by default
+        int pack_factor = origin_total_width / target_total_width;
+        HCL_DEBUG_LEVEL(2) << "[ debug ] Pack layout of tensor " << name << "("
+                           << shape_[name] << ") to (" << op->value << ")";
+
+        Type new_type = Int(dtype_[name].bits() * pack_factor);
+        TransformInfo info = {name,         var,          current_producer,
+                              shape_[name], target_shape, dtype_[name],
+                              new_type,     false,        true,
+                              pack_factor,  false};
+
+        if (!worklist.count(name)) {
+          worklist[name] = info;
+          // if the target has been transposed
+          // first transpose and then data-packing
+        } else {
+          Array<Expr> new_shape;
+          int shape_size = worklist[name].target_shape.size();
+          for (int k = 0; k < shape_size; k++) {
+            if (k == shape_size - 1) {
+              int factor = pack_factor;
+              int new_dim = worklist[name].target_shape[k].as<IntImm>()->value;
+              new_dim /= factor;
+              new_shape.push_back(new_dim);
+            } else {
+              new_shape.push_back(worklist[name].target_shape[k]);
+            }
+          }
+          worklist[name].target_shape = new_shape;
+          worklist[name].type = Int(pack_factor * worklist[name].type.bits());
+          worklist[name].is_pack = true;
+          worklist[name].pack_factor = pack_factor;
+        }
+      }
+      // The attribute node itself is dropped; only its mutated body is kept.
+      return this->Mutate(op->body);
+    }
+    return IRMutator::Mutate_(op, s);
+  }
+  // Collects shapes/dtypes, gathers requests, then applies them one by one.
+  Stmt Transform(Stmt s) {
+    CollectTypeShape(s, shape_, dtype_, api_args_);
+    Stmt stmt = this->Mutate(s);
+    // Process the worklist one by one
+    for (auto& kv : worklist) {
+      auto tensor_name = kv.first;
+      auto& info = kv.second;
+      auto producer_name = info.anchor_producer;
+      CHECK(shape_.count(tensor_name)) << tensor_name;
+
+      string status = "Processing tensor " + tensor_name + "(pack:";
+      status += info.is_pack ? std::to_string(info.pack_factor) : "no";
+      status += ", transpose:";
+      status += info.is_transpose ? "yes)" : "no)";
+
+      HCL_DEBUG_LEVEL(2) << "--------------";
+      HCL_DEBUG_LEVEL(2) << "[ INFO ] " << status << ". shape "
+                         << info.target_shape << ", type " << info.type;
+
+      VarExpr new_buf(tensor_name + ".new");  // NOTE(review): not used below
+      HCL_DEBUG_LEVEL(2) << "    [ debug ] transform layout of tensor "
+                         << tensor_name << " from stage " << producer_name;
+
+      // Mutate tensor access indices from all children stages
+      stmt = UpdateBufferLayout(stmt, info, task_map_, kernel_inputs_);
+      // Insert new buffer before anchor stage
+      CHECK(task_map_.count(producer_name));
+      stmt = InsertReshapeBuffer(stmt, info, task_map_, kernel_inputs_);
+    }
+    return stmt;
+  }
+};
+
+Stmt TransformLayout(Stmt stmt, Array<NodeRef> api_args) {
+  HCL_DEBUG_LEVEL(2) << "------------ Transform Layout --------------";
+  // Pass 1: rebuild the coarse-grained task graph of the device kernel.
+  TaskGraphBuilder graph_builder(api_args);
+  stmt = graph_builder.Mutate(stmt);
+
+  // Pass 2: apply every requested transpose/pack, given the kernel arg names.
+  vector<string> kernel_inputs;
+  for (const auto& kernel_arg : graph_builder.kernel_input_args) {
+    kernel_inputs.push_back(kernel_arg.get()->name_hint);
+  }
+  LayoutTransformer transformer(graph_builder.task_map, api_args,
+                                kernel_inputs);
+  return transformer.Transform(stmt);
+}
+
+}  // namespace ir
+}  // namespace TVM
diff --git a/tvm/src/schedule/schedule_dataflow_rewrite.cc b/tvm/src/schedule/schedule_dataflow_rewrite.cc
index 57c06da55..dde48d605 100644
--- a/tvm/src/schedule/schedule_dataflow_rewrite.cc
+++ b/tvm/src/schedule/schedule_dataflow_rewrite.cc
@@ -175,6 +175,155 @@ class InfoUpdater final : public IRMutator {
   const bool is_sender_;
 };
 
+void Schedule::transform_layout(Stage parent, const Tensor& target,
+                                Array<Expr> shape) {
+  // Looks up the stage of `target`; the handle itself is not used below.
+  Stage target_stage = (*this)[target];
+  const auto* extern_op = parent->op.as<ExternOpNode>();
+  if (extern_op == nullptr) return;  // only extern-op parents are handled
+  // Encode the requested shape as "d0:d1:...", e.g. {4, 8} -> "4:8".
+  std::string encoded;
+  for (auto& dim : shape) {
+    CHECK(dim.as<IntImm>());
+    if (!encoded.empty()) encoded += ":";
+    encoded += std::to_string(dim.as<IntImm>()->value);
+  }
+  // Wrap the parent body in an attribute naming the tensor and its layout.
+  Stmt wrapped =
+      AttrStmt::make(VarExpr(target->op->name), attr::tensor_layout_attrs,
+                     StringImm::make(encoded), extern_op->body);
+  parent->op = ExternOpNode::make(
+      extern_op->name, extern_op->tag, extern_op->axis, extern_op->inputs,
+      extern_op->input_placeholders, extern_op->output_placeholders, wrapped);
+}
+
+// Create PE sub-stages for `axes`, attached to the stage that owns `target`
+Array<Tensor> Schedule::explicit_unroll(const Tensor& target,
+                                        const Array<IterVar> axes,
+                                        bool autosa) {
+  // Locate the stage
+  Stage target_stage = (*this)[target];
+  Array<Tensor> ret_tensors;
+
+  // The stage to be explicitly unrolled
+  Buffer target_buffer;
+  auto op = target_stage->op.as<ExternOpNode>();
+  CHECK(op);
+  target_buffer = op->output_placeholders[0];
+  ArrayNode* stages = (*this)->stages.CopyOnWrite();
+  size_t pos = FindNodeRef(stages, target_stage);
+
+  // Unroll the loops explicitly
+  // 1. Create sub-stages and output buffers
+  // 2. Return new body for parent stage with attaching anchors
+  CHECK_GT(axes.size(), 0);
+
+  // Update the dataflow graph
+  // 1. The parent (original) stage has new inputs
+  // 2. The newly created stages output to parent, and takes parent inputs
+  auto parent_new_inputs = op->inputs;
+  auto parent_new_input_placeholders = op->input_placeholders;
+
+  // Assume the outer axis precedes the inner ones
+  // Create PE substage array (1D flattened)
+  std::unordered_map<int, int> pe_row_number;
+  std::vector<Buffer> stage_buffers;
+  std::string unrolled_axes = "";
+  std::string delim = "";
+
+  // TODO(hecmay): support more than 2 level
+  for (int level = axes.size() - 1; level >= 0; level--) {
+    auto& axis = axes[level];
+    auto min = axis->dom->min.as<IntImm>()->value;
+    auto extent = axis->dom->extent.as<IntImm>()->value;
+    HCL_DEBUG_LEVEL(2) << "[ unrolling ] loop No. " << level << " range(" << min
+                       << "," << extent << ")";
+    // NOTE(review): extent doubles as upper bound; assumes min == 0, confirm.
+    int row_index = axes.size() - level - 1;
+    pe_row_number[row_index] = extent - min;
+
+    // innermost loop unrolling
+    for (int k = min; k < extent; k++) {
+      int replicate_times = 1;
+      // replicate unrolled inner-level PEs
+      for (auto& kv : pe_row_number) {
+        if (kv.first < row_index) {
+          replicate_times *= kv.second;
+        }
+      }
+
+      for (int r = 0; r < replicate_times; r++) {
+        std::string new_name;  // stays empty for > 2 axes
+        if (axes.size() == 1) {
+          new_name = target->op->name + "_pe_" + std::to_string(k);
+        } else if (axes.size() == 2) {
+          if (row_index == 0) break;  // 2-D: the last axis creates no stages
+          new_name = target->op->name + "_pe_" + std::to_string(k) + "_" +
+                     std::to_string(r);
+        }
+
+        Array<Tensor> new_inputs = op->inputs;
+        Array<Buffer> new_input_placeholders = op->input_placeholders;
+        Array<Buffer> new_output_placeholders;
+
+        // Create op buffer node for new stage
+        Buffer new_output_buf =
+            BufferNode::make(Var(new_name, Handle()), Int(32), Array<Expr>(),
+                             Array<Expr>(), Expr(), new_name, "", 0, 0);
+        new_output_placeholders.push_back(new_output_buf);
+        stage_buffers.push_back(new_output_buf);
+
+        // Create new body for the PE
+        Stmt body = AttrStmt::make(
+            VarExpr(new_name), "kernel_scope", StringImm::make(new_name),
+            // Evaluate::make(1));
+            Evaluate::make(Call::make(Int(32), "pe", {}, Call::Intrinsic)));
+
+        // Create extern op node for the stage
+        auto new_op = ExternOpNode::make(new_name, "", Array<IterVar>(),
+                                         new_inputs, new_input_placeholders,
+                                         new_output_placeholders, body);
+        HCL_DEBUG_LEVEL(2) << "[ debug ] unrolling pe " << new_name
+                           << " body: " << body;
+
+        // Insert the output tensor
+        ret_tensors.push_back(new_op.output(0));
+        parent_new_inputs.push_back(new_op.output(0));
+        parent_new_input_placeholders.push_back(new_output_buf);
+
+        // Add stage into the DFG
+        Stage new_stage(new_op);
+        stages->data.insert(stages->data.begin() + pos, new_stage.node_);
+        (*this)->stage_map.Set(new_op, new_stage);
+      }
+    }
+    unrolled_axes += delim + axis->var.get()->name_hint;
+    delim = ",";
+  }
+
+  // Use original op in case that the stage has been tiled
+  auto origin_op = target_stage->origin_op.as<ExternOpNode>();
+  CHECK(origin_op);
+
+  // Update parent ops and body (set of attaching anchors)
+  Stmt new_body = origin_op->body;
+  Array<Expr> annotate_keys = {StringImm::make("unroll")};
+  Array<Expr> annotate_values = {StringImm::make(unrolled_axes)};
+  std::string key = (autosa) ? "autosa" : "systolic";
+  new_body = ExternModule::make(key, StringImm::make("HLS"), new_body,
+                                annotate_keys, annotate_values);
+  // Each created PE buffer gets an "attach_scope" wrapper around the body.
+  std::string parent_name = target->op->name;
+  for (auto& buffer : stage_buffers) {
+    new_body = AttrStmt::make(VarExpr(buffer.node_), "attach_scope",
+                              StringImm::make(parent_name), new_body);
+  }
+  target_stage->op = ExternOpNode::make(
+      op->name, op->tag, op->axis, parent_new_inputs,
+      parent_new_input_placeholders, op->output_placeholders, new_body);
+  return ret_tensors;
+}
+
 // Initialize static channel count
 int InfoUpdater::channelCount = 0;
 
diff --git a/tvm/src/schedule/schedule_lang.cc b/tvm/src/schedule/schedule_lang.cc
index f5d304dbc..da8b32e07 100644
--- a/tvm/src/schedule/schedule_lang.cc
+++ b/tvm/src/schedule/schedule_lang.cc
@@ -261,6 +261,21 @@ void CreateStencil(StageNode* stage, int burst_width, int unroll_factor,
                          op->input_placeholders, op->output_placeholders, body);
 }
 
+// Wraps the stage body in an "autosa" ExternModule with empty annotation
+// lists and rebuilds the stage's extern op around it.
+// NOTE(review): stage->op is assumed to be an ExternOpNode (not checked).
+void CreateSystolic(StageNode* stage) {
+  const ExternOpNode* extern_op = stage->op.as<ExternOpNode>();
+  Array<Expr> keys;
+  Array<Expr> values;
+  // Create an extern module to wrap the AutoSA generated HLS code
+  Stmt wrapped = ExternModule::make("autosa", StringImm::make("HLS"),
+                                    extern_op->body, keys, values);
+  stage->op = ExternOpNode::make(
+      extern_op->name, extern_op->tag, extern_op->axis, extern_op->inputs,
+      extern_op->input_placeholders, extern_op->output_placeholders, wrapped);
+}
+
 void CreateDataflow(StageNode* stage, IterVar var) {
   const ExternOpNode* op = stage->op.as<ExternOpNode>();
   Stmt body;
@@ -493,6 +508,11 @@ Stage& Stage::stencil(int burst_width, int unroll_factor, int num_iteration) {
   return *this;
 }
 
+Stage& Stage::systolic() { // NOLINT(*)
+  CreateSystolic(this->operator->());  // wrap this stage's body for AutoSA
+  return *this;                        // returned to allow call chaining
+}
+
 Stage& Stage::dataflow(IterVar var) {
   CreateDataflow(operator->(), var);
   return *this;