Review note (free text ahead of the first "diff --git" header; git apply skips it).
The hunks below are restored to one diff row per line. Relative to the collapsed
original, only five added ("+") rows in ODLAModel.Execute (python/halo/odla.py)
were edited, one row for one row, so every @@ count still holds:
  * assert((batch==1) and "msg")  ->  assert batch == 1, "msg"
  * assert(False and f"...")      ->  raise AssertionError(f"...")
The old forms folded the message into the condition, so it never became the
assertion message. Runtime strings and the exception type are unchanged.

diff --git a/python/halo/halo.py b/python/halo/halo.py
index 8853938af..a48d58621 100644
--- a/python/halo/halo.py
+++ b/python/halo/halo.py
@@ -62,6 +62,17 @@ class CXXCodeGenOpts(Structure):
         ("save_temps", c_bool),
     ]
 
+HALO_MODEL_INFO_MAX_OUTPUT_NR = 64
+HALO_VODLA_MAX_OUTPUT_RSC_EST = 2048
+class ModelInfo(Structure):
+    _fields_ = [
+        ("num_outputs", c_size_t),
+        ("output_buf_sizes", c_size_t*HALO_MODEL_INFO_MAX_OUTPUT_NR),
+        ("input_qps", c_int),
+        ("adaptive_bsz", c_int),
+        ("output_rsc_est", c_char*HALO_VODLA_MAX_OUTPUT_RSC_EST),
+    ]
+
 """
 int halo_Compile(halo::ModelFormat model_format,
                  unsigned num_models,
@@ -91,6 +102,24 @@ class CXXCodeGenOpts(Structure):
     c_void_p,  # model_info
 ]
 
+Analyze.argtypes = [
+    c_int,  # model_format
+    c_uint,  # num_models
+    c_void_p,  # models
+    c_void_p,  # model_sizes
+    c_char_p,  # target
+    c_int,  # batch
+    c_uint,  # num_input_shapes
+    c_void_p,  # input_shapes
+    c_uint,  # num_inputs
+    c_void_p,  # inputs
+    c_uint,  # num_outputs
+    c_void_p,  # outputs
+    c_void_p,  # cg_opts
+    c_char_p,  # filename
+    c_void_p,  # model_info
+]
+
 
 def exec(args):
     proc = subprocess.run(args)
@@ -139,8 +168,8 @@ def CompileModel(model_file, input_shapes, output_names, batch, format):
     target = "cxx".encode("utf-8")
     output_filename = output_file.encode("utf-8")
 
-    logger.info("Begin Halo compilation")
-    logger.info("Halo lib:" + str(lib_halo._name))
+    logger.debug("Begin Halo compilation")
+    logger.debug("Halo lib:" + str(lib_halo._name))
     logger.debug("Intermediate file:" + str(output_filename))
     Compile(
         format_val,
@@ -150,7 +179,7 @@ def CompileModel(model_file, input_shapes, output_names, batch, format):
         target,
         batch,
         num_input_shapes,
-        (c_char_p * num_input_shapes)(*input_shapes),
+        (c_char_p * num_input_shapes)(*input_shapes),  # input_shapes,
         num_inputs,
         inputs,
         num_outputs,
@@ -159,11 +188,11 @@ def CompileModel(model_file, input_shapes, output_names, batch, format):
         output_filename,
         0,
     )
-    logger.info("Done Halo Compilation")
+    logger.debug("Done Halo Compilation")
     return [output_file, output_bin]
 
 
-def AnalyzeModel(model_file, input_shapes, batch, format):
+def AnalyzeModel(model_file, input_shapes, batch, format, model_info):
     output_file = ""
     odla_lib = cast(create_string_buffer(b""), c_char_p)
     opts = CXXCodeGenOpts()
@@ -215,10 +244,9 @@ def AnalyzeModel(model_file, input_shapes, batch, format):
         outputs,
         pointer(opts),
         output_filename,
-        0
+        pointer(model_info),
     )
 
-
 def CompileODLAModel(files, device, debug=False):
     cc_file = files[0]
     bin_file = files[1]
@@ -235,7 +263,7 @@ def CompileODLAModel(files, device, debug=False):
         str(so_file),
         str(cc_file),
         str(bin_file),
-        "-l" + device,
+        "-l" + "vodla",
         "-Wl,-rpath=/usr/local/lib",
     ]
     logger.debug("Building ODLA model: " + " ".join(args))
diff --git a/python/halo/inference.py b/python/halo/inference.py
index 1ac879de4..022457c43 100644
--- a/python/halo/inference.py
+++ b/python/halo/inference.py
@@ -20,6 +20,7 @@
 import sys
 import logging
 from logging import StreamHandler, Formatter
+import os
 
 
 class Inference:
@@ -31,6 +32,7 @@ def __init__(
         device,
         batch,
         format,
+        qps,
         debug,
         log_level,
     ):
@@ -53,34 +55,27 @@ def __init__(
         self.format = format
         self.device = device
         self.batch = batch
+        self.qps = qps
         self.model = None
         self.so_file = None
-        self.intermediate_files = []
-        self.save_temps = False
 
     def __del__(self):
-        self.logger.info(str(self.intermediate_files))
-        for file in self.intermediate_files:
-            if not self.save_temps:
-                Path(file).unlink()
         del self.model
 
     def Initialize(self):
-        self.logger.info("Begin initialization")
-        files = halo.CompileModel(
+        self.logger.info(f"Begin initialization;{self.model_file}")
+        self.so_file = "/usr/local/lib/libvodla.so"
+        self.model = odla.ODLAModel(self.so_file)
+        self.model.Load(
             self.model_file,
             self.input_shapes,
             self.output_names,
-            self.batch,
             self.format,
-        )
-        self.so_file = halo.CompileODLAModel(files, self.device, self.debug)
-        self.intermediate_files = [*files, self.so_file]
-        self.model = odla.ODLAModel(self.so_file)
-        self.model.Load()
+            self.batch,
+            self.qps)
         self.logger.info("Done initialization")
 
     def Run(self, data):
         if self.model is None:
             self.Initialize()
-        return self.model.Execute(data)
+        return self.model.Execute(data, self.model_file, self.batch)
diff --git a/python/halo/odla.py b/python/halo/odla.py
index 068848c17..394c85d02 100644
--- a/python/halo/odla.py
+++ b/python/halo/odla.py
@@ -16,7 +16,9 @@
 from enum import Enum
 from time import time
 import logging
-
+import os
+from pathlib import Path
+from halo import halo
 
 class Device(Enum):
     CUDA = 1
@@ -37,57 +39,191 @@ def __init__(self, so_file):
         self.logger = logging.getLogger(__name__)
         self.so_file = so_file
         self.h = None
-        self.buffers = []
+        self.save_temps = False
+        self.intermediate_files = []
+
+    def __del__(self):
+        self.logger.info(str(self.intermediate_files))
+        for file in self.intermediate_files:
+            if not self.save_temps:
+                Path(file).unlink()
+        if self.h is not None:
+            self.h.odla_DestroyContext(self.ctx)
+            self.h.odla_DestroyComputation(self.comp)
+            self.h.odla_DestroyDevice(self.device)
 
-    def Load(self):
+    def Load(self,model,input_shapes,output_names,format,batch,qps):
         if self.h is None:
             self.h = CDLL(self.so_file)
         self.comp = c_void_p(0)
-        self.h.odla_CreateComputation(pointer(self.comp))
-        # TODO:
-        use_sim = c_bool(True)
-        self.h.odla_SetComputationItem(self.comp, 7, pointer(use_sim))
+        self.device = c_void_p(0)
+        model_info = halo.ModelInfo()
+        model_info.input_qps = qps
+        model_info.adaptive_bsz = batch
+        rsc_est = c_void_p(0)
+        if qps>0 and batch==1:
+            halo.AnalyzeModel(model,input_shapes,1,format,model_info)
+            rsc_est = (c_char_p)(model_info.output_rsc_est)
+        self.h.odla_AllocateDevice(c_void_p(0), 0, pointer(self.device), rsc_est)
+        self.files = halo.CompileModel(
+            model,
+            input_shapes,
+            output_names,
+            batch,
+            format,
+        )
+        self.intermediate_files = [*self.files]
+        cc_file = str(self.files[0]).encode("utf-8")
+        bin_file = str(self.files[1]).encode("utf-8")
 
-        self.h.model_helper(self.comp)
+        self.comp = self.h.model_helper(cc_file, bin_file)
+        self.ctx = c_void_p(0)
+        self.h.odla_CreateContext(pointer(self.ctx))
         n = c_int32(-1)
         self.h.odla_GetNumOfArgsFromComputation(self.comp, pointer(n))
-        self.nr_args = n.value
+        # self.nr_args = n.value
+        if("bert" in model):
+            self.nr_args = 3
+        elif("shoucai" in model):
+            self.nr_args = 16
+        else:
+            self.nr_args = 1
 
-        nr_args = c_int32(-1)
+        # nr_args = c_int32(-1)
         self.h.odla_GetNumOfOutputsFromComputation(self.comp, pointer(n))
-        self.nr_outputs = n.value
+        # self.nr_outputs = n.value
+        if("bert" in model):
+            self.nr_outputs = 2
+        else:
+            self.nr_outputs = 1
+
         self.in_vals = []
         for idx in range(0, self.nr_args):
             arg_v = c_void_p(0)
             self.h.odla_GetArgFromComputationByIdx(self.comp, idx, pointer(arg_v))
-            vt = ValueType()
-            self.h.odla_GetValueType(arg_v, pointer(vt))
-            self.in_vals.append((arg_v.value, vt))
-
-        self.ctx = c_void_p(0)
-        self.h.odla_CreateContext(pointer(self.ctx))
 
         self.out_vals = []
         for idx in range(0, self.nr_outputs):
             out = c_void_p(0)
             self.h.odla_GetOutputFromComputationByIdx(self.comp, idx, pointer(out))
-            vt = ValueType()
-            self.h.odla_GetValueType(out, pointer(vt))
-            n = 1
-            for r in range(0, vt.shape.size):
-                n *= vt.shape.dims[r]
-            self.out_vals.append((out, vt, n))
-            buf = (c_float * n)()  # FIXME: handle types
-            self.h.odla_BindToOutput(out, buf, self.ctx)
-            self.buffers.append(buf)
-
-    def Execute(self, data):
-        for idx, v in enumerate(self.in_vals):
-            self.h.odla_BindToArgument(
-                v[0], data[idx].ctypes.data_as(c_void_p), self.ctx
-            )
+
+    def Execute(self, data, model, batch):
+        print(f"model:{model},batch:{batch}")
+        # bind input
+        if("bert" in model):
+            self.h.odla_BindToArgument(c_void_p(0), data[0].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(1), data[1].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(2), data[2].ctypes.data_as(c_void_p), self.ctx)
+        elif("shoucai" in model):
+            self.h.odla_BindToArgument(c_void_p(0), data[0].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(1), data[1].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(2), data[2].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(3), data[3].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(4), data[4].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(5), data[5].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(6), data[6].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(7), data[7].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(8), data[8].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(9), data[9].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(10), data[10].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(11), data[11].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(12), data[12].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(13), data[13].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(14), data[14].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(15), data[15].ctypes.data_as(c_void_p), self.ctx)
+        else:
+            self.h.odla_BindToArgument(c_void_p(0), data[0].ctypes.data_as(c_void_p), self.ctx)
+
+        # output buffer
+        buffers = []
+        if("bert" in model):
+            buf1 = (c_float * 256 * batch)()
+            buffers.append(buf1)
+            self.h.odla_BindToOutput(c_void_p(0), buf1, self.ctx)
+            buf2 = (c_float * 256 * batch)()
+            buffers.append(buf2)
+            self.h.odla_BindToOutput(c_void_p(1), buf2, self.ctx)
+        else:
+            if("resnet50" in model):
+                buf = (c_float * 1*1000 * batch)()
+            elif("dbnet" in model):
+                assert batch == 1, "dbnet only support 1 batch."
+                buf = (c_float * 1228800 * batch)()
+            elif("crnn" in model):
+                assert batch == 1, "crnn only support 1 batch."
+                buf = (c_float * 918146 * batch)()
+            elif("shoucai" in model):
+                #Add info for Shoucai Model
+                assert batch == 1, "shoucai only support 1 batch!"
+                buf = (c_float * 425 * batch)()
+            else:
+                raise AssertionError(f"unknown model.{model}")
+            buffers.append(buf)
+            self.h.odla_BindToOutput(c_void_p(0), buf, self.ctx)
+
+        # send data
+        if("resnet50" in model):
+            self.h.model_data(self.ctx, (c_int32 * 1)(*[224*224*3*4*batch]), (c_int32 * 1)(*[1000*4*batch]))
+        elif("dbnet" in model):
+            self.h.model_data(self.ctx, (c_int32 * 1)(*[1*3*960*1280*4*batch]), (c_int32 * 1)(*[1228800 * 4*batch]))
+        elif("crnn" in model):
+            self.h.model_data(self.ctx, (c_int32 * 1)(*[63840*4*batch]), (c_int32 * 1)(*[918146*4*batch]))
+        elif("bert" in model):
+            self.h.model_data(self.ctx, (c_int32 * 3)(*[512*4*batch, 256*4*batch, 256*8*batch]), (c_int32 * 2)(*[256*4*batch,256*4*batch]))
+        elif ("shoucai" in model):
+            #per Input File
+            #self.h.model_data(self.ctx, (c_int32 * 16)(*[8*4*batch, 1*4*batch, 592*4*batch, 1*4*batch, 512*4*batch, 512*4*batch, 73728*4*batch, 27200*4*batch, 13600*4*batch, 13600*4*batch, 122400*4*batch, 350200*4*batch, 15552*4*batch, 1161984*4*batch, 178*4*batch, 425*8*batch]), (c_int32*1)(*[425*4*batch]))
+
+            #per Input Shape
+            self.h.model_data(self.ctx, (c_int32 * 18)(*[425*4*batch, 8*4*batch, 1*4*batch, 592*4*batch, 1*4*batch, 512*4*batch, 512*4*batch, 73728*4*batch, 425*4*batch, 425*4*batch, 27200*4*batch,13600*4*batch, 13600*4*batch, 122400*4*batch, 350200*4*batch, 15552*4*batch, 1161984*4*batch, 178*4*batch]), (c_int32*1)(*[425*4*batch]))
+
+
+            '''
+            self.h.model_data(self._ctx, (c_int32 * 16)(*[8*4*batch, \ # embedding_ui_oage_shared_embedding.txt
+                                                    1*4*batch, \ # input_from_feature_columns_concat_3.txt
+                                                    592*4*batch, \ # input_from_feature_columns_concat.txt
+                                                    1*4*batch, \ #input_from_feature_columns_concat_5.txt
+                                                    512*4*batch, \ #all_clk_seq_1_time.txt
+                                                    512*4*batch, \#all_clk_seq_1_st.txt
+                                                    73728*4*batch, \ #seq_input_from_feature_columns_concat_1.txt
+                                                    27200*4*batch, \#embedding_item_id_d_shard_embedding_2.txt
+                                                    13600*4*batch, \#embedding_item_cate_id_d_shared_embedding_2.txt
+                                                    13600*4*batch, \#embedding_item_seller_id_d_shared_embedding_2.txt
+                                                    122400*4*batch, \#input_from_feature_columns_concat_4.txt
+                                                    350200*4*batch, \#input_from_feature_columns_concat_1.txt
+                                                    15552*4*batch, \#seq_input_from_feature_columns_concat.txt
+                                                    1161984*4*batch,\ #seq_input_from_feature_columns_concat_2.txt
+                                                    178*4*batch, \#input_from_feature_columns_concat_7.txt
+                                                    425*8*batch]), \#Unique_preprocess_int64.txt
+                                                    (c_int32*1)(*[425*4*batch])) #output
+
+
+            self.h.model_data(self._ctx, (c_int32 * 18)(*[425*4*batch, \# LookupPkOP
+                                                    8*4*batch, \ # embedding_ui_oage_shared_embedding.txt
+                                                    1*4*batch, \ # input_from_feature_columns_concat_3.txt
+                                                    592*4*batch, \ # input_from_feature_columns_concat.txt
+                                                    1*4*batch, \ #input_from_feature_columns_concat_5.txt
+                                                    512*4*batch, \ #all_clk_seq_1_time.txt
+                                                    512*4*batch, \#all_clk_seq_1_st.txt
+                                                    73728*4*batch, \ #seq_input_from_feature_columns_concat_1.txt
+                                                    425*4*batch, \ #batch_fill_attributes_for_gul_rank_item_feature
+                                                    425*4*batch, \ #batch_fill_attributes_for_gul_rank_item_feature_1
+                                                    27200*4*batch, \#embedding_item_id_d_shard_embedding_2.txt
+                                                    13600*4*batch, \#embedding_item_cate_id_d_shared_embedding_2.txt
+                                                    13600*4*batch, \#embedding_item_seller_id_d_shared_embedding_2.txt
+                                                    122400*4*batch, \#input_from_feature_columns_concat_4.txt
+                                                    350200*4*batch, \#input_from_feature_columns_concat_1.txt
+                                                    15552*4*batch, \#seq_input_from_feature_columns_concat.txt
+                                                    1161984*4*batch,\ #seq_input_from_feature_columns_concat_2.txt
+                                                    178*4*batch]), \#input_from_feature_columns_concat_7.txt
+
+                                                    (c_int32*1)(*[425*4*batch])) #output
+            '''
+        else:
+            raise AssertionError(f"unknown model.{model}")
+
         s = time()
-        self.h.odla_ExecuteComputation(self.comp, self.ctx, 0, c_void_p(0))
+        self.h.odla_ExecuteComputation(self.comp, self.ctx, 0, self.device)
         t = time()
         self.logger.info("Execution time:" + str(t - s) + " sec(s)")
-        return self.buffers
+        return buffers