Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update python api for vodla #766

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 36 additions & 8 deletions python/halo/halo.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,17 @@ class CXXCodeGenOpts(Structure):
("save_temps", c_bool),
]

# Capacities of the fixed-size arrays inside the native model-info record.
HALO_MODEL_INFO_MAX_OUTPUT_NR = 64    # max entries in output_buf_sizes
HALO_VODLA_MAX_OUTPUT_RSC_EST = 2048  # byte capacity of output_rsc_est

class ModelInfo(Structure):
    """ctypes mirror of halo's C-side model-info struct.

    Appears to be filled in as an out-parameter by the native Analyze call
    (callers pass ``pointer(ModelInfo())``). Field order and types must match
    the native layout exactly — do not reorder or retype entries.
    """

    _fields_ = [
        ("num_outputs", c_size_t),
        ("output_buf_sizes", c_size_t * HALO_MODEL_INFO_MAX_OUTPUT_NR),
        ("input_qps", c_int),
        ("adaptive_bsz", c_int),
        ("output_rsc_est", c_char * HALO_VODLA_MAX_OUTPUT_RSC_EST),
    ]


"""
int halo_Compile(halo::ModelFormat model_format, unsigned num_models,
Expand Down Expand Up @@ -91,6 +102,24 @@ class CXXCodeGenOpts(Structure):
c_void_p, # model_info
]

# ctypes argument signature for the native halo Analyze entry point.
# The positional order is the FFI/ABI contract with the C library — it mirrors
# the Compile signature above, except the trailing pointer receives the
# analysis results (a ModelInfo out-parameter; callers pass
# pointer(ModelInfo()) rather than 0).
Analyze.argtypes = [
    c_int, # model_format (halo::ModelFormat enum value)
    c_uint, # num_models
    c_void_p, # models
    c_void_p, # model_sizes
    c_char_p, # target
    c_int, # batch
    c_uint, # num_input_shapes
    c_void_p, # input_shapes
    c_uint, # num_inputs
    c_void_p, # inputs
    c_uint, # num_outputs
    c_void_p, # outputs
    c_void_p, # cg_opts
    c_char_p, # filename
    c_void_p, # model_info (ModelInfo*, written by the native side)
]


def exec(args):
proc = subprocess.run(args)
Expand Down Expand Up @@ -139,8 +168,8 @@ def CompileModel(model_file, input_shapes, output_names, batch, format):

target = "cxx".encode("utf-8")
output_filename = output_file.encode("utf-8")
logger.info("Begin Halo compilation")
logger.info("Halo lib:" + str(lib_halo._name))
logger.debug("Begin Halo compilation")
logger.debug("Halo lib:" + str(lib_halo._name))
logger.debug("Intermediate file:" + str(output_filename))
Compile(
format_val,
Expand All @@ -150,7 +179,7 @@ def CompileModel(model_file, input_shapes, output_names, batch, format):
target,
batch,
num_input_shapes,
(c_char_p * num_input_shapes)(*input_shapes),
(c_char_p * num_input_shapes)(*input_shapes), # input_shapes,
num_inputs,
inputs,
num_outputs,
Expand All @@ -159,11 +188,11 @@ def CompileModel(model_file, input_shapes, output_names, batch, format):
output_filename,
0,
)
logger.info("Done Halo Compilation")
logger.debug("Done Halo Compilation")
return [output_file, output_bin]


def AnalyzeModel(model_file, input_shapes, batch, format):
def AnalyzeModel(model_file, input_shapes, batch, format, model_info):
output_file = ""
odla_lib = cast(create_string_buffer(b""), c_char_p)
opts = CXXCodeGenOpts()
Expand Down Expand Up @@ -215,10 +244,9 @@ def AnalyzeModel(model_file, input_shapes, batch, format):
outputs,
pointer(opts),
output_filename,
0
pointer(model_info),
)


def CompileODLAModel(files, device, debug=False):
cc_file = files[0]
bin_file = files[1]
Expand All @@ -235,7 +263,7 @@ def CompileODLAModel(files, device, debug=False):
str(so_file),
str(cc_file),
str(bin_file),
"-l" + device,
"-l" + "vodla",
"-Wl,-rpath=/usr/local/lib",
]
logger.debug("Building ODLA model: " + " ".join(args))
Expand Down
25 changes: 10 additions & 15 deletions python/halo/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import sys
import logging
from logging import StreamHandler, Formatter
import os


class Inference:
Expand All @@ -31,6 +32,7 @@ def __init__(
device,
batch,
format,
qps,
debug,
log_level,
):
Expand All @@ -53,34 +55,27 @@ def __init__(
self.format = format
self.device = device
self.batch = batch
self.qps = qps
self.model = None
self.so_file = None
self.intermediate_files = []
self.save_temps = False

def __del__(self):
    """Best-effort cleanup: delete intermediate files and drop the model handle.

    NOTE(review): runs during interpreter teardown, where self.logger / Path
    may already be partially finalized — failures here are unrecoverable.
    """
    # Record which files are candidates for removal (aids temp debugging).
    self.logger.info(str(self.intermediate_files))
    for file in self.intermediate_files:
        if not self.save_temps:
            # save_temps is a debug switch; normally generated files are removed.
            Path(file).unlink()
    # Drop our reference so the ODLAModel's own __del__ can release native state.
    del self.model

def Initialize(self):
self.logger.info("Begin initialization")
files = halo.CompileModel(
self.logger.info(f"Begin initialization;{self.model_file}")
self.so_file = "/usr/local/lib/libvodla.so"
self.model = odla.ODLAModel(self.so_file)
self.model.Load(
self.model_file,
self.input_shapes,
self.output_names,
self.batch,
self.format,
)
self.so_file = halo.CompileODLAModel(files, self.device, self.debug)
self.intermediate_files = [*files, self.so_file]
self.model = odla.ODLAModel(self.so_file)
self.model.Load()
self.batch,
self.qps)
self.logger.info("Done initialization")

def Run(self, data):
    """Run inference on ``data``, lazily initializing the model on first use.

    Returns whatever the underlying ODLA model's Execute produces.
    """
    if self.model is None:
        # First call: load the model via Initialize() before executing.
        self.Initialize()
    model = self.model
    return model.Execute(data, self.model_file, self.batch)
204 changes: 170 additions & 34 deletions python/halo/odla.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
from enum import Enum
from time import time
import logging

import os
from pathlib import Path
from halo import halo

class Device(Enum):
CUDA = 1
Expand All @@ -37,57 +39,191 @@ def __init__(self, so_file):
self.logger = logging.getLogger(__name__)
self.so_file = so_file
self.h = None
self.buffers = []
self.save_temps = False
self.intermediate_files = []

def __del__(self):
    """Remove generated intermediate files and release native ODLA handles."""
    # Record which files are candidates for removal (aids temp debugging).
    self.logger.info(str(self.intermediate_files))
    for file in self.intermediate_files:
        if not self.save_temps:
            # save_temps is a debug switch; normally generated files are removed.
            Path(file).unlink()
    # self.h is only non-None after Load() has loaded the shared library,
    # which is also when ctx/comp/device get created.
    if self.h is not None:
        # Teardown in this order: context, then computation, then device.
        self.h.odla_DestroyContext(self.ctx)
        self.h.odla_DestroyComputation(self.comp)
        self.h.odla_DestroyDevice(self.device)

def Load(self):
def Load(self,model,input_shapes,output_names,format,batch,qps):
if self.h is None:
self.h = CDLL(self.so_file)
self.comp = c_void_p(0)
self.h.odla_CreateComputation(pointer(self.comp))
# TODO:
use_sim = c_bool(True)
self.h.odla_SetComputationItem(self.comp, 7, pointer(use_sim))
self.device = c_void_p(0)
model_info = halo.ModelInfo()
model_info.input_qps = qps
model_info.adaptive_bsz = batch
rsc_est = c_void_p(0)
if qps>0 and batch==1:
halo.AnalyzeModel(model,input_shapes,1,format,model_info)
rsc_est = (c_char_p)(model_info.output_rsc_est)
self.h.odla_AllocateDevice(c_void_p(0), 0, pointer(self.device), rsc_est)
self.files = halo.CompileModel(
model,
input_shapes,
output_names,
batch,
format,
)
self.intermediate_files = [*self.files]
cc_file = str(self.files[0]).encode("utf-8")
bin_file = str(self.files[1]).encode("utf-8")

self.h.model_helper(self.comp)
self.comp = self.h.model_helper(cc_file, bin_file)
self.ctx = c_void_p(0)
self.h.odla_CreateContext(pointer(self.ctx))
n = c_int32(-1)
self.h.odla_GetNumOfArgsFromComputation(self.comp, pointer(n))
self.nr_args = n.value
# self.nr_args = n.value
if("bert" in model):
self.nr_args = 3
elif("shoucai" in model):
self.nr_args = 16
else:
self.nr_args = 1
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we can raise NotImplementedError for the rest of the models.


nr_args = c_int32(-1)
# nr_args = c_int32(-1)
self.h.odla_GetNumOfOutputsFromComputation(self.comp, pointer(n))
self.nr_outputs = n.value
# self.nr_outputs = n.value
if("bert" in model):
self.nr_outputs = 2
else:
self.nr_outputs = 1

self.in_vals = []
for idx in range(0, self.nr_args):
arg_v = c_void_p(0)
self.h.odla_GetArgFromComputationByIdx(self.comp, idx, pointer(arg_v))
vt = ValueType()
self.h.odla_GetValueType(arg_v, pointer(vt))
self.in_vals.append((arg_v.value, vt))

self.ctx = c_void_p(0)
self.h.odla_CreateContext(pointer(self.ctx))

self.out_vals = []
for idx in range(0, self.nr_outputs):
out = c_void_p(0)
self.h.odla_GetOutputFromComputationByIdx(self.comp, idx, pointer(out))
vt = ValueType()
self.h.odla_GetValueType(out, pointer(vt))
n = 1
for r in range(0, vt.shape.size):
n *= vt.shape.dims[r]
self.out_vals.append((out, vt, n))
buf = (c_float * n)() # FIXME: handle types
self.h.odla_BindToOutput(out, buf, self.ctx)
self.buffers.append(buf)

def Execute(self, data):
for idx, v in enumerate(self.in_vals):
self.h.odla_BindToArgument(
v[0], data[idx].ctypes.data_as(c_void_p), self.ctx
)

def Execute(self, data, model, batch):
print(f"model:{model},batch:{batch}")
# bind input
if("bert" in model):
self.h.odla_BindToArgument(c_void_p(0), data[0].ctypes.data_as(c_void_p), self.ctx)
self.h.odla_BindToArgument(c_void_p(1), data[1].ctypes.data_as(c_void_p), self.ctx)
self.h.odla_BindToArgument(c_void_p(2), data[2].ctypes.data_as(c_void_p), self.ctx)
elif("shoucai" in model):
self.h.odla_BindToArgument(c_void_p(0), data[0].ctypes.data_as(c_void_p), self.ctx)
self.h.odla_BindToArgument(c_void_p(1), data[1].ctypes.data_as(c_void_p), self.ctx)
self.h.odla_BindToArgument(c_void_p(2), data[2].ctypes.data_as(c_void_p), self.ctx)
self.h.odla_BindToArgument(c_void_p(3), data[3].ctypes.data_as(c_void_p), self.ctx)
self.h.odla_BindToArgument(c_void_p(4), data[4].ctypes.data_as(c_void_p), self.ctx)
self.h.odla_BindToArgument(c_void_p(5), data[5].ctypes.data_as(c_void_p), self.ctx)
self.h.odla_BindToArgument(c_void_p(6), data[6].ctypes.data_as(c_void_p), self.ctx)
self.h.odla_BindToArgument(c_void_p(7), data[7].ctypes.data_as(c_void_p), self.ctx)
self.h.odla_BindToArgument(c_void_p(8), data[8].ctypes.data_as(c_void_p), self.ctx)
self.h.odla_BindToArgument(c_void_p(9), data[9].ctypes.data_as(c_void_p), self.ctx)
self.h.odla_BindToArgument(c_void_p(10), data[10].ctypes.data_as(c_void_p), self.ctx)
self.h.odla_BindToArgument(c_void_p(11), data[11].ctypes.data_as(c_void_p), self.ctx)
self.h.odla_BindToArgument(c_void_p(12), data[12].ctypes.data_as(c_void_p), self.ctx)
self.h.odla_BindToArgument(c_void_p(13), data[13].ctypes.data_as(c_void_p), self.ctx)
self.h.odla_BindToArgument(c_void_p(14), data[14].ctypes.data_as(c_void_p), self.ctx)
self.h.odla_BindToArgument(c_void_p(15), data[15].ctypes.data_as(c_void_p), self.ctx)
else:
self.h.odla_BindToArgument(c_void_p(0), data[0].ctypes.data_as(c_void_p), self.ctx)

# output buffer
buffers = []
if("bert" in model):
buf1 = (c_float * 256 * batch)()
buffers.append(buf1)
self.h.odla_BindToOutput(c_void_p(0), buf1, self.ctx)
buf2 = (c_float * 256 * batch)()
buffers.append(buf2)
self.h.odla_BindToOutput(c_void_p(1), buf2, self.ctx)
else:
if("resnet50" in model):
buf = (c_float * 1*1000 * batch)()
elif("dbnet" in model):
assert((batch==1) and "dbnet only support 1 batch.")
buf = (c_float * 1228800 * batch)()
elif("crnn" in model):
assert((batch==1) and "crnn only support 1 batch.")
buf = (c_float * 918146 * batch)()
elif("shoucai" in model):
#Add info for Shoucai Model
assert((batch==1) and "shoucai only support 1 batch!")
buf = (c_float * 425 * batch)()
else:
assert(False and f"unknown model.{model}")
buffers.append(buf)
self.h.odla_BindToOutput(c_void_p(0), buf, self.ctx)

# send data
if("resnet50" in model):
self.h.model_data(self.ctx, (c_int32 * 1)(*[224*224*3*4*batch]), (c_int32 * 1)(*[1000*4*batch]))
elif("dbnet" in model):
self.h.model_data(self.ctx, (c_int32 * 1)(*[1*3*960*1280*4*batch]), (c_int32 * 1)(*[1228800 * 4*batch]))
elif("crnn" in model):
self.h.model_data(self.ctx, (c_int32 * 1)(*[63840*4*batch]), (c_int32 * 1)(*[918146*4*batch]))
elif("bert" in model):
self.h.model_data(self.ctx, (c_int32 * 3)(*[512*4*batch, 256*4*batch, 256*8*batch]), (c_int32 * 2)(*[256*4*batch,256*4*batch]))
elif ("shoucai" in model):
#per Input File
#self.h.model_data(self.ctx, (c_int32 * 16)(*[8*4*batch, 1*4*batch, 592*4*batch, 1*4*batch, 512*4*batch, 512*4*batch, 73728*4*batch, 27200*4*batch, 13600*4*batch, 13600*4*batch, 122400*4*batch, 350200*4*batch, 15552*4*batch, 1161984*4*batch, 178*4*batch, 425*8*batch]), (c_int32*1)(*[425*4*batch]))

#per Input Shape
self.h.model_data(self.ctx, (c_int32 * 18)(*[425*4*batch, 8*4*batch, 1*4*batch, 592*4*batch, 1*4*batch, 512*4*batch, 512*4*batch, 73728*4*batch, 425*4*batch, 425*4*batch, 27200*4*batch,13600*4*batch, 13600*4*batch, 122400*4*batch, 350200*4*batch, 15552*4*batch, 1161984*4*batch, 178*4*batch]), (c_int32*1)(*[425*4*batch]))


'''
self.h.model_data(self._ctx, (c_int32 * 16)(*[8*4*batch, \ # embedding_ui_oage_shared_embedding.txt
1*4*batch, \ # input_from_feature_columns_concat_3.txt
592*4*batch, \ # input_from_feature_columns_concat.txt
1*4*batch, \ #input_from_feature_columns_concat_5.txt
512*4*batch, \ #all_clk_seq_1_time.txt
512*4*batch, \#all_clk_seq_1_st.txt
73728*4*batch, \ #seq_input_from_feature_columns_concat_1.txt
27200*4*batch, \#embedding_item_id_d_shard_embedding_2.txt
13600*4*batch, \#embedding_item_cate_id_d_shared_embedding_2.txt
13600*4*batch, \#embedding_item_seller_id_d_shared_embedding_2.txt
122400*4*batch, \#input_from_feature_columns_concat_4.txt
350200*4*batch, \#input_from_feature_columns_concat_1.txt
15552*4*batch, \#seq_input_from_feature_columns_concat.txt
1161984*4*batch,\ #seq_input_from_feature_columns_concat_2.txt
178*4*batch, \#input_from_feature_columns_concat_7.txt
425*8*batch]), \#Unique_preprocess_int64.txt
(c_int32*1)(*[425*4*batch])) #output


self.h.model_data(self._ctx, (c_int32 * 18)(*[425*4*batch, \# LookupPkOP
8*4*batch, \ # embedding_ui_oage_shared_embedding.txt
1*4*batch, \ # input_from_feature_columns_concat_3.txt
592*4*batch, \ # input_from_feature_columns_concat.txt
1*4*batch, \ #input_from_feature_columns_concat_5.txt
512*4*batch, \ #all_clk_seq_1_time.txt
512*4*batch, \#all_clk_seq_1_st.txt
73728*4*batch, \ #seq_input_from_feature_columns_concat_1.txt
425*4*batch, \ #batch_fill_attributes_for_gul_rank_item_feature
425*4*batch, \ #batch_fill_attributes_for_gul_rank_item_feature_1
27200*4*batch, \#embedding_item_id_d_shard_embedding_2.txt
13600*4*batch, \#embedding_item_cate_id_d_shared_embedding_2.txt
13600*4*batch, \#embedding_item_seller_id_d_shared_embedding_2.txt
122400*4*batch, \#input_from_feature_columns_concat_4.txt
350200*4*batch, \#input_from_feature_columns_concat_1.txt
15552*4*batch, \#seq_input_from_feature_columns_concat.txt
1161984*4*batch,\ #seq_input_from_feature_columns_concat_2.txt
178*4*batch]), \#input_from_feature_columns_concat_7.txt

(c_int32*1)(*[425*4*batch])) #output
'''
else:
assert(False and f"unknown model.{model}")

s = time()
self.h.odla_ExecuteComputation(self.comp, self.ctx, 0, c_void_p(0))
self.h.odla_ExecuteComputation(self.comp, self.ctx, 0, self.device)
t = time()
self.logger.info("Execution time:" + str(t - s) + " sec(s)")
return self.buffers
return buffers