[Schedule] Support for intra-kernel data placement #436
base: tvm
@@ -0,0 +1,292 @@
import re
import os
import copy
import sys
import time
from .util import run_process
from .devices import Project, Platform


# Static class for entries of each SA module
class SystolicArrayRegistry(object):
    sa_module_cnt = 0


def count_SA_size(code):
    # Recover the systolic array dimensions from the generated PE_wrapper call
    pos = code.rfind("PE_wrapper")
    function = code[pos:pos+100]
    dims = re.findall(r" (\d+),", function)
    if len(dims) < 2:
        print("Failed to generate 2d SA. Size", dims)
        sys.exit()

    dimX, dimY = int(dims[0])+1, int(dims[1])+1
    print(f"[ INFO ] generating SA dimension {dimX}x{dimY}.")


def indent(num):
    return " " * num


def get_function_code(name, code):
    # Extract the body of a generated helper function from the header text
    pos = code.find(name)
    start_pos = pos - len("inline void")
    end_pos = code.find("/* Helper", pos)
    return code[start_pos:end_pos]


def get_ser_size(code):
    lines = code.split("\n")
Review comment on lines +24 to +35: I am not sure what Python formatter HeteroCL uses, but mixing one-line space and two-line space seems weird.
pattern = "<= (\d+);" | ||
size = 1 | ||
for line in lines: | ||
rets = re.findall(pattern, line) | ||
if len(rets) > 0: | ||
assert len(rets) == 1 | ||
size *= (int(rets[0])+1) | ||
else: continue | ||
return size | ||
|
||
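Note: a minimal, self-contained sketch of what get_ser_size computes, assuming the AutoSA host header contains counted serialization loops (the loop bounds below are made up):

    import re
    # Hypothetical loop nest from an AutoSA host (de)serialization helper
    sample = "for (int c0 = 0; c0 <= 63; c0++)\n  for (int c1 = 0; c1 <= 31; c1++)"
    size = 1
    for line in sample.split("\n"):
        bounds = re.findall(r"<= (\d+);", line)
        if bounds:
            size *= int(bounds[0]) + 1
    print(size)  # (63+1) * (31+1) == 2048 flattened elements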
def insert_data_pack(ret_code, header, off_chip_data, written_data):
    ret_code = ret_code.replace("buffer_", "").replace("[0]", "")
    # Extract the designated data types
    pattern = re.findall(r"autosa_func\((.*?)\)", ret_code)[0]
    args = pattern.split(", ")
    signature = re.findall(r"autosa_func\((.*?)\);", header)

    # If the arg is accessed from off-chip memory, then we replace the typedef
    # with the target packed data type
    types = signature[0].split(", ")
    for t in types:
        for arg in args:
            if arg in t:
                pattern = r"_t(\d+) "
                target_type = re.findall(pattern, t)[0]
                target_type_bits = int(target_type) * 32
                # Off-chip coalesced data access
                if arg in off_chip_data:
                    header = f"#undef {arg}_t\n#define {arg}_t ap_uint<{target_type_bits}>\n" + header

                # Insert data packing and (de)serialization
                # Create a new buffer and reshape it to the original buffer after or before the AutoSA func call
                else:
                    if arg in written_data:
                        print(f"[ INFO ] Writing to on-chip memory {arg}. Packed into ap_uint<{target_type_bits}>...")
                        # Allocate a new buffer and perform data deserialization
                        deser_func = f"host_deserialize_{arg}"
                        # Check if the size matches
                        code = get_function_code(deser_func, header)
                        size = get_ser_size(code)
                        ret_code = ret_code.replace(arg, f"{arg}_sa")
                        ret_code = f"float {arg}_sa[{size}];\n" + indent(5) + ret_code + \
                            indent(6) + f"{deser_func}({arg}, {arg}_sa);\n"
                    else:
                        pass

    return ret_code, header

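Note: a hedged illustration of the rewrite above; the buffer names and sizes are made up. For an on-chip written tensor C whose AutoSA interface argument is typed C_t4 (4 * 32 = 128 bits) and whose host_deserialize_C helper covers 4096 elements:

    # ret_code before:  "autosa_func(A, B, C);"
    # ret_code after (indentation elided):
    #   float C_sa[4096];
    #   autosa_func(A, B, C_sa);
    #   host_deserialize_C(C, C_sa);
    # while an off-chip tensor A instead gets "#undef A_t\n#define A_t ap_uint<128>\n"
    # prepended to the generated header.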
# Update HLS function names in the generated Extern IP core
def add_prefix(header, ret_code):
    # Preserved function keywords in AutoSA generated code
    function_list = [
        "autosa_func", "PE_wrapper", "PE"
    ]
    index = SystolicArrayRegistry.sa_module_cnt
    for f in function_list:
        header = header.replace(f"{f}(", f" inst{index}_{f}(")
        ret_code = ret_code.replace(f"{f}(", f" inst{index}_{f}(")
    SystolicArrayRegistry.sa_module_cnt += 1
    return header, ret_code

def infer_default_params(loop_bounds):
    assert len(loop_bounds) > 1, loop_bounds
    extra_flags = "--simd-info=./autosa_tests/mm_hcl/simd_info.json "
    # Params for MatMul
    if len(loop_bounds) == 3:
        loop_bounds = [ int(_) for _ in loop_bounds ]
        m, n, k = loop_bounds
        if m > 1 and n > 1 and k > 1:
            ST = 3
            SA_dim_x = 4
            SA_dim_y = 4
            PART = f"{m},{n},{k}"
            if m > 256 or n > 256 or k > 256: LAT = [16,16]
            else: LAT = [ int(m/SA_dim_x), int(n/SA_dim_y) ]
            LAT = [ str(1) if _ == 0 else str(_) for _ in LAT ]
            LAT = ",".join(LAT)
            SIMD = k if k <= 8 else 4
        # Map reduction loop to space dim
        else:
            ST = 2
            PART = "10,16"
            LAT = "2,2"
            SIMD = 4
Review comment on lines +117 to +119: What are these magic numbers? Could you add comments or use more specific variable names here?
extra_flags += "--local-reduce --reduce-op=\"+\" --simd-touch-space " | ||
|
||
    # Params for Conv
    else:
        OC, OH, OW, IC, R, C = loop_bounds
        ST = 4
        print(f"[ INFO ] input size OC({OC}), OH({OH}), OW({OW}), IC({IC}), R({R}), C({C})")
        PART = "16,13,13,1"
        LAT = "2,1,2"
        SIMD = "1,1,2,4"
Review comment on lines +126 to +129: Why "16,13,13,1"? I suppose this is not a test file but a general implementation.
extra_flags = "--simd-info=./autosa_tests/cnn/simd_info.json " | ||
return ST, PART, LAT, SIMD, extra_flags | ||
|
||
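A note on the parameters questioned above (my reading, not asserted by the PR): ST, PART, LAT, and SIMD feed the space_time, array_part, latency, and simd fields of the --sa-sizes option assembled in generate_systolic_array below. A hedged sketch for a hypothetical 64x64x64 MatMul with these defaults:

    # Hypothetical 64x64x64 MatMul; SA_dim_x = SA_dim_y = 4
    ST, PART, LAT, SIMD = 3, "64,64,64", "16,16", 4
    sizes = ("kernel[]->space_time[{}];kernel[]->array_part[{}];"
             "kernel[]->latency[{}];kernel[]->simd[{}]").format(ST, PART, LAT, SIMD)
    # -> --sa-sizes="{kernel[]->space_time[3];kernel[]->array_part[64,64,64];
    #                 kernel[]->latency[16,16];kernel[]->simd[4]}"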
def generate_systolic_array(keys, values, code, backend):
Review comment: Seems the codegen, copying files, and generating headers are done in this function? Maybe it would be better if this function can be separated into several subfunctions or several steps like what we did in
    # Analyze packing and transpose information
    input_attr_info = dict()
    packed_data = list()
    transposed_data = list()

    is_axis_enabled = False
    loop_bounds = list()
    off_chip_data = list()
    written_data = list()

    # Process attribute information for AutoSA module
    for index in range(len(keys)):
        key = keys[index].value
        if key == "axis":
            is_axis_enabled = True
            continue
        elif key == "loop_bound":
            loop_bounds = values[index].value.split(",")
        elif key == "tensor_placement":
            info = values[index].value.split(",")
            for var in info:
                var_name = var.replace("[0]", "").replace("[1]", "")
                var_name = var_name.replace("[read]", "").replace("[write]", "")
                if "[0]" in var:
                    off_chip_data.append(var_name)
                if "[write]" in var:
                    written_data.append(var_name)
        else:
            try:
                is_transpose, pack_factor = values[index].value.split(",")
                input_attr_info[var] = [int(is_transpose), int(pack_factor)]
                if int(pack_factor) > 0:
                    packed_data.append(var)
                if int(is_transpose) == 1:
                    transposed_data.append(var)
            except:
                pass

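For reference, a hedged example of the attribute payloads this loop expects (tensor names are made up; the encoding is inferred from the parsing code above):

    # "loop_bound"       -> "64,64,64"
    # "tensor_placement" -> "A[0][read],B[0][read],C[1][write]"
    #     "[0]" marks off-chip data, "[write]" marks data the SA kernel writes
    # "A"                -> "0,512"    # per-tensor "is_transpose,pack_factor" pair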
    instance = SystolicArrayRegistry.sa_module_cnt
    autosa_c_source = f"hcl_autosa_tmp_inst{instance}.c"
    pwd = os.getcwd()
    with open(autosa_c_source, "w") as fp:
        fp.write("#include <stdio.h>\n")
        fp.write("int main(int argc, char **argv) {\n")
        fp.write(code)
        fp.write("}")

    header = "#include <autosa.h>\n"
    ret_code = "autosa_func(args);\n"

    # Check the AutoSA installation
    autosa_dir = ""
    try:
        autosa_dir = os.environ["AUTOSA"]
    except:
        print("[{}] WARNING: AutoSA not found. Please set up the env variable AUTOSA".format(time.strftime("%H:%M:%S", time.gmtime())))
    if autosa_dir == "":
        ret_code = "// Not found AutoSA. returns function placeholder\n" + indent(6) + ret_code
        header = ""
        return [header, ret_code]

    source_path = os.path.join(pwd, autosa_c_source)
    cmd = "cd {}; ".format(autosa_dir)
    cmd += "./autosa "
    cmd += "{} ".format(source_path)
    cmd += "--config=./autosa_config/autosa_config.json "
    if backend == "vhls":
        cmd += "--target=autosa_hls_c "
    elif backend == "aocl":
        cmd += "--target=autosa_opencl "
    else:
        raise RuntimeError(f"Illegal backend {backend}")
    cmd += "--output-dir=./autosa.tmp/output "

    # Get the default values
    ST, PART, LAT, SIMD, extra_flags = infer_default_params(loop_bounds)
    # Internal debugging interface to set up the params
    sa_space_time = os.getenv("SA_SPACE_TIME", ST)
    sa_array_part = os.getenv("SA_ARRAY_PAR", PART)
    sa_lat_hiding = os.getenv("SA_LAT_HIDING", LAT)
    sa_simd = os.getenv("SA_SIMD", SIMD)

    print(f"[ INFO ] AutoSA params: Array partition {sa_array_part}. Latency hiding {sa_lat_hiding}. SIMD {sa_simd}")
    cmd += "--sa-sizes=\"{{kernel[]->space_time[{}];".format(sa_space_time)
    cmd += "kernel[]->array_part[{}];".format(sa_array_part)
    cmd += "kernel[]->latency[{}];".format(sa_lat_hiding)
    cmd += "kernel[]->simd[{}]".format(sa_simd)
    cmd += "}\" "

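Note: a hedged sketch of the debugging override above; the values are made up, and the SA_* environment variables take precedence over the inferred defaults:

    import os
    os.environ["SA_SPACE_TIME"] = "3"
    os.environ["SA_ARRAY_PAR"] = "64,64,64"
    os.environ["SA_LAT_HIDING"] = "16,16"
    os.environ["SA_SIMD"] = "4"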
cmd += "--hls " | ||
cmd += "--hcl " | ||
if is_axis_enabled: | ||
pass # cmd += "--axi-stream " | ||
|
||
    # Configure data packing
    if backend == "vhls":
        data_pack_config = ""
        if len(packed_data) > 0:
            data_pack_config = "--data-pack-sizes=\"{"
            delim = ""
            for var in packed_data:
                data_pack_config += delim + "kernel[]->{}[8,32,64]".format(var)
                delim = ";"
            data_pack_config += "}\" "

        if data_pack_config == "":
            data_pack_config = "--no-data-pack "
        cmd += data_pack_config
        cmd += extra_flags

    # Additional flags for Intel OpenCL
    if backend == "aocl":
        cmd += "--loop-infinitize --double-buffer-style=0 "

    # Add serialization if the SA module has interface arguments
    # cmd += "--host-serialize "
    print(f"[ INFO ] AutoSA command {cmd}")

    # Save the AutoSA command for debugging purposes
    with open(f"hcl_autosa_cmd_inst{instance}.sh", "w") as fp:
        fp.write(cmd)
    run_process(cmd)

    # Extract the AutoSA generated code
    if backend == "vhls": autosa_header = f"hcl_autosa_tmp_inst{instance}_hcl_decl.h"
    else: autosa_header = "hcl_autosa_tmp_kernel.h"

    ext = "cpp" if backend == "vhls" else "cl"
    source_file = f"{autosa_dir}/autosa.tmp/output/src/hcl_autosa_tmp_inst{instance}_kernel.{ext}"
    with open(source_file, "r") as fp:
        header = fp.read() + "\n"
        header = header.replace(f"#include \"{autosa_header}\"", "")

if backend == "aocl": | ||
# Also extract the helper functions for data serialization and deserialization | ||
with open(f"{autosa_dir}/autosa.tmp/output/src/hcl_autosa_tmp_host.h", "r") as f: | ||
content = f.read() | ||
annotation = "/* Helper Function */" | ||
start_pos = content.find(annotation) | ||
end_pos = content.rfind(annotation) + len(annotation) | ||
header += content[start_pos:end_pos] + "\n" | ||
|
||
# For xilinx HLS backend | ||
else: | ||
count_SA_size(header) | ||
|
||
    # External module call inside the top function
    with open(f"{autosa_dir}/autosa.tmp/output/src/{autosa_header}", "r") as fp:
        ret_code = fp.readlines()[0].strip() + ";\n"

    # Add prefix to SA functions
    header, ret_code = add_prefix(header, ret_code)

    # Bitcast the input arguments (to the AutoSA selected bit-packing factor)
    # 1. Substitute the interface argument data type decided by AutoSA (and possibly do some extra padding)
    # 2. Substitute the data serialization size and intrinsic
    ret_code, header = insert_data_pack(ret_code, header, off_chip_data, written_data)

    return [ header, ret_code ]
@@ -265,6 +265,61 @@ def join(self, srcs, dest=None):
"inconsistent tensor joining" | ||
self.sch.join(target, dest, self[src]) | ||
|
||
    def transpose(self, tensor=None):
Review comment: I think there is one in
""" transpose a tensor """ | ||
if tensor is not None: | ||
src = None | ||
if isinstance(tensor, tuple): | ||
src, tensor = tensor | ||
src = self.__getitem__(src) | ||
else: | ||
src = self.__getitem__(tensor) | ||
tensor = tensor.tensor | ||
try: | ||
shape = [ int(_.value) for _ in tensor.shape ] | ||
except: | ||
shape = [ int(_) for _ in tensor.shape ] | ||
|
||
target_shape = shape[::-1] | ||
self.cascade_tensor = tensor | ||
self.cascade_source_stage = None | ||
self.sch.transpose(src, tensor, target_shape) | ||
return self | ||
|
||
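A hedged usage sketch of the new primitive (the schedule s, stage, and tensor A are hypothetical; the call shape is inferred from this diff):

    s.transpose(A)           # request a reversed-shape layout for A, e.g. (32, 64) -> (64, 32)
    s.transpose((stage, A))  # or scope the transform to a producing stage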
    def pack(self, tensor=None, factor=512):
        """ pack data for data transfer """
        if isinstance(tensor, list):
            for t in tensor:
                ret = self.pack(t, factor=factor)
            return self

        if tensor is not None:
            if isinstance(tensor, tuple):
                src, tensor = tensor
                src = self.__getitem__(src)
            else:
                src = self.__getitem__(tensor)
                tensor = tensor.tensor

            try:
                shape = [ int(_.value) for _ in tensor.shape ]
            except:
                shape = [ int(_) for _ in tensor.shape ]
            bits = types.get_bitwidth(tensor.dtype)
            # Calculate the target shape
            new_shape = [1]
            for index in range(len(shape)):
                index = len(shape)-index-1
                bits *= shape[index]
                if bits > factor:
                    new_shape = shape[:index] + [ int(bits/factor) ]
                    break

            self.cascade_tensor = tensor
            self.cascade_source_stage = None
            self.sch.transpose(src, tensor, new_shape)
Review comment: I have a question about this
        return self

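A hedged usage sketch of pack (s, A, and B are hypothetical). For a 64x64 float32 tensor with the default factor=512, the loop above folds the trailing dimension into 512-bit words, so the packed shape works out to [64, 4] (64 * 32 bits per row / 512 bits per word):

    s.pack(A, factor=512)       # 64 x 64 x 32-bit elements -> 64 x 4 x 512-bit words
    s.pack([A, B], factor=512)  # a list packs each tensor with the same factor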
    def to(self, tensor, dst=None, src=None, axis=0,
           mode=_expr.IO.DMA, fifo_depth=1, burst_len=-1):

---|---|---|
|
@@ -380,6 +380,9 @@ def lower(sch,
    stmt = ir_pass.AdjustBufferBinding(stmt, arg_list)
    stmt = ir_pass.InferStream(stmt, arg_list)
    stmt = ir_pass.AdjustBufferBinding(stmt, arg_list)
    # Perform layout transformation
    stmt = ir_pass.TransformLayout(stmt, arg_list)
    stmt = ir_pass.AdjustBufferBinding(stmt, arg_list)
Review comment: What does AdjustBufferBinding do? Why is it called multiple times after each pass?
    for f in lower_phase3:
        stmt = f(stmt)
    if simple_mode:

Review comment: The vivado_hls has been included in the previous paths?