From 372b7325f07ea935393776bbeb7e681cc97cbd21 Mon Sep 17 00:00:00 2001
From: "maruiyan.mry" <maruiyan.mry@alibaba-inc.com>
Date: Tue, 12 Oct 2021 01:04:58 -0700
Subject: [PATCH 1/6] feat: support vodla demo

---
 python/halo/halo.py      |  10 ++--
 python/halo/inference.py |  15 +++--
 python/halo/odla.py      | 116 ++++++++++++++++++++++++++++-----------
 3 files changed, 99 insertions(+), 42 deletions(-)

diff --git a/python/halo/halo.py b/python/halo/halo.py
index 8853938af..7d55ba37d 100644
--- a/python/halo/halo.py
+++ b/python/halo/halo.py
@@ -139,8 +139,8 @@ def CompileModel(model_file, input_shapes, output_names, batch, format):
 
     target = "cxx".encode("utf-8")
     output_filename = output_file.encode("utf-8")
-    logger.info("Begin Halo compilation")
-    logger.info("Halo lib:" + str(lib_halo._name))
+    logger.debug("Begin Halo compilation")
+    logger.debug("Halo lib:" + str(lib_halo._name))
     logger.debug("Intermediate file:" + str(output_filename))
     Compile(
         format_val,
@@ -150,7 +150,7 @@ def CompileModel(model_file, input_shapes, output_names, batch, format):
         target,
         batch,
         num_input_shapes,
-        (c_char_p * num_input_shapes)(*input_shapes),
+        (c_char_p * num_input_shapes)(*input_shapes), # input_shapes,
         num_inputs,
         inputs,
         num_outputs,
@@ -159,7 +159,7 @@ def CompileModel(model_file, input_shapes, output_names, batch, format):
         output_filename,
         0,
     )
-    logger.info("Done Halo Compilation")
+    logger.debug("Done Halo Compilation")
     return [output_file, output_bin]
 
 
@@ -235,7 +235,7 @@ def CompileODLAModel(files, device, debug=False):
         str(so_file),
         str(cc_file),
         str(bin_file),
-        "-l" + device,
+        "-l" + "vodla",
         "-Wl,-rpath=/usr/local/lib",
     ]
     logger.debug("Building ODLA model: " + " ".join(args))
diff --git a/python/halo/inference.py b/python/halo/inference.py
index 1ac879de4..508a95197 100644
--- a/python/halo/inference.py
+++ b/python/halo/inference.py
@@ -20,6 +20,7 @@
 import sys
 import logging
 from logging import StreamHandler, Formatter
+import os
 
 
 class Inference:
@@ -57,6 +58,7 @@ def __init__(
         self.so_file = None
         self.intermediate_files = []
         self.save_temps = False
+        print(f"self.batch:{self.batch}")
 
     def __del__(self):
         self.logger.info(str(self.intermediate_files))
@@ -66,7 +68,7 @@ def __del__(self):
         del self.model
 
     def Initialize(self):
-        self.logger.info("Begin initialization")
+        self.logger.info(f"Begin initialization;{self.model_file}")
         files = halo.CompileModel(
             self.model_file,
             self.input_shapes,
@@ -74,13 +76,14 @@ def Initialize(self):
             self.batch,
             self.format,
         )
-        self.so_file = halo.CompileODLAModel(files, self.device, self.debug)
-        self.intermediate_files = [*files, self.so_file]
-        self.model = odla.ODLAModel(self.so_file)
-        self.model.Load()
+        # self.so_file = halo.CompileODLAModel(files, self.device, self.debug)
+        self.so_file = "/usr/local/lib/libvodla.so"
+        self.intermediate_files = [*files]
+        self.model = odla.ODLAModel(self.so_file,files)
+        self.model.Load(self.model_file)
         self.logger.info("Done initialization")
 
     def Run(self, data):
         if self.model is None:
             self.Initialize()
-        return self.model.Execute(data)
+        return self.model.Execute(data, self.model_file, self.batch)
diff --git a/python/halo/odla.py b/python/halo/odla.py
index 068848c17..7843fd292 100644
--- a/python/halo/odla.py
+++ b/python/halo/odla.py
@@ -16,7 +16,8 @@
 from enum import Enum
 from time import time
 import logging
-
+import os
+from pdb import set_trace
 
 class Device(Enum):
     CUDA = 1
@@ -33,61 +34,114 @@ class ValueType(Structure):
 
 
 class ODLAModel:
-    def __init__(self, so_file):
+    def __init__(self, so_file, files):
         self.logger = logging.getLogger(__name__)
         self.so_file = so_file
         self.h = None
         self.buffers = []
+        self.files = files
 
-    def Load(self):
+    def Load(self,model):
         if self.h is None:
             self.h = CDLL(self.so_file)
         self.comp = c_void_p(0)
+        self.device = c_void_p(0)
+        self.h.odla_AllocateDevice(c_void_p(0), 0, pointer(self.device))
         self.h.odla_CreateComputation(pointer(self.comp))
         # TODO:
-        use_sim = c_bool(True)
-        self.h.odla_SetComputationItem(self.comp, 7, pointer(use_sim))
+        # use_sim = c_bool(True)
+        # self.h.odla_SetComputationItem(self.comp, 7, pointer(use_sim))
+        cc_file = str(self.files[0]).encode("utf-8")
+        bin_file = str(self.files[1]).encode("utf-8")
 
-        self.h.model_helper(self.comp)
+        self.comp = self.h.model_helper(cc_file, bin_file)
+        self.ctx = c_void_p(0)
+        self.h.odla_CreateContext(pointer(self.ctx))
         n = c_int32(-1)
         self.h.odla_GetNumOfArgsFromComputation(self.comp, pointer(n))
-        self.nr_args = n.value
+        # self.nr_args = n.value
+        if("bert" in model):
+            print("bert_model input num.")
+            self.nr_args = 3
+        else:
+            self.nr_args = 1
 
-        nr_args = c_int32(-1)
+        # nr_args = c_int32(-1)
         self.h.odla_GetNumOfOutputsFromComputation(self.comp, pointer(n))
-        self.nr_outputs = n.value
+        # self.nr_outputs = n.value
+        if("bert" in model):
+            print("bert_model output num.")
+            self.nr_outputs = 2
+        else:
+            self.nr_outputs = 1
+
         self.in_vals = []
         for idx in range(0, self.nr_args):
             arg_v = c_void_p(0)
             self.h.odla_GetArgFromComputationByIdx(self.comp, idx, pointer(arg_v))
-            vt = ValueType()
-            self.h.odla_GetValueType(arg_v, pointer(vt))
-            self.in_vals.append((arg_v.value, vt))
+            # vt = ValueType()
+            # self.h.odla_GetValueType(arg_v, pointer(vt))
+            # self.in_vals.append((arg_v.value, vt))
 
-        self.ctx = c_void_p(0)
-        self.h.odla_CreateContext(pointer(self.ctx))
+        # self.ctx = c_void_p(0)
+        # self.h.odla_CreateContext(pointer(self.ctx))
 
         self.out_vals = []
         for idx in range(0, self.nr_outputs):
             out = c_void_p(0)
             self.h.odla_GetOutputFromComputationByIdx(self.comp, idx, pointer(out))
-            vt = ValueType()
-            self.h.odla_GetValueType(out, pointer(vt))
-            n = 1
-            for r in range(0, vt.shape.size):
-                n *= vt.shape.dims[r]
-            self.out_vals.append((out, vt, n))
-            buf = (c_float * n)() # FIXME: handle types
-            self.h.odla_BindToOutput(out, buf, self.ctx)
-            self.buffers.append(buf)
-
-    def Execute(self, data):
-        for idx, v in enumerate(self.in_vals):
-            self.h.odla_BindToArgument(
-                v[0], data[idx].ctypes.data_as(c_void_p), self.ctx
-            )
+            # vt = ValueType()
+            # self.h.odla_GetValueType(out, pointer(vt))
+            # n = 1
+            # for r in range(0, vt.shape.size):
+            #     n *= vt.shape.dims[r]
+            # self.out_vals.append((out, vt, n))
+            # buf = (c_float * n)() # FIXME: handle types
+            # self.h.odla_BindToOutput(out, buf, self.ctx)
+            # self.buffers.append(buf)
+
+    def Execute(self, data, model, batch):
+        print(f"model:{model},batch:{batch}")
+        # for idx, v in enumerate(self.in_vals):
+        #     self.h.odla_BindToArgument(
+        #         v[0], data[idx].ctypes.data_as(c_void_p), self.ctx
+        #     )
+        self.h.odla_BindToArgument(c_void_p(0), data[0].ctypes.data_as(c_void_p), self.ctx)
+        # output buffer
+        buffers = []
+        if("bert" in model):
+            buf1 = (c_float * 256 * batch)() 
+            buffers.append(buf1)
+            self.h.odla_BindToOutput(c_void_p(0), buf1, self.ctx)
+            buf2 = (c_float * 256 * batch)()
+            buffers.append(buf2)
+            self.h.odla_BindToOutput(c_void_p(0), buf2, self.ctx)
+        else:
+            if("resnet50" in model):
+                buf = (c_float * 1*1000 * batch)()
+            elif("dbnet" in model):
+                buf = (c_float * 1228800 * batch)()
+            elif("crnn" in model):
+                buf = (c_float * 918146 * batch)()
+            else:
+                assert(False and f"unknown model.{model}")
+            buffers.append(buf)
+            self.h.odla_BindToOutput(c_void_p(0), buf, self.ctx)
+
+        if("resnet50" in model):
+            self.h.model_data(self.ctx,  (c_int32 * 1)(*[224*224*3*4]), (c_int32 * 1)(*[1000*4]))
+        elif("dbnet" in model):
+            self.h.model_data(self.ctx,  (c_int32 * 1)(*[1*3*960*1280*4]), (c_int32 * 1)(*[1228800 * 4]))
+        elif("crnn" in model):
+            self.h.model_data(self.ctx,  (c_int32 * 1)(*[63840*4]), (c_int32 * 1)(*[918146*4]))
+        elif("bert" in model):
+            self.h.model_data(self.ctx,  (c_int32 * 1)(*[512*4, 256*4, 256*4]), (c_int32 * 1)(*[256*4,256*4]))
+        else:
+            assert(False and f"unknown model.{model}")
+    
         s = time()
-        self.h.odla_ExecuteComputation(self.comp, self.ctx, 0, c_void_p(0))
+        # self.h.odla_ExecuteComputation(self.comp, self.ctx, 0, c_void_p(0))
+        self.h.odla_ExecuteComputation(self.comp, self.ctx, 0, self.device)
         t = time()
         self.logger.info("Execution time:" + str(t - s) + " sec(s)")
-        return self.buffers
+        return buffers

From c5886f0990b9127c7fd72bf87921f0f0d25e9fcd Mon Sep 17 00:00:00 2001
From: tianboh <54729592+tianboh@users.noreply.github.com>
Date: Tue, 2 Nov 2021 17:19:11 +0800
Subject: [PATCH 2/6] add odla.py support for shoucai (#664)

---
 python/halo/odla.py | 119 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 90 insertions(+), 29 deletions(-)

diff --git a/python/halo/odla.py b/python/halo/odla.py
index 7843fd292..d6acdc447 100644
--- a/python/halo/odla.py
+++ b/python/halo/odla.py
@@ -18,6 +18,7 @@
 import logging
 import os
 from pdb import set_trace
+import numpy as np
 
 class Device(Enum):
     CUDA = 1
@@ -61,8 +62,9 @@ def Load(self,model):
         self.h.odla_GetNumOfArgsFromComputation(self.comp, pointer(n))
         # self.nr_args = n.value
         if("bert" in model):
-            print("bert_model input num.")
             self.nr_args = 3
+        elif("shoucai" in model):
+            self.nr_args = 16
         else:
             self.nr_args = 1
 
@@ -70,7 +72,6 @@ def Load(self,model):
         self.h.odla_GetNumOfOutputsFromComputation(self.comp, pointer(n))
         # self.nr_outputs = n.value
         if("bert" in model):
-            print("bert_model output num.")
             self.nr_outputs = 2
         else:
             self.nr_outputs = 1
@@ -79,68 +80,128 @@ def Load(self,model):
         for idx in range(0, self.nr_args):
             arg_v = c_void_p(0)
             self.h.odla_GetArgFromComputationByIdx(self.comp, idx, pointer(arg_v))
-            # vt = ValueType()
-            # self.h.odla_GetValueType(arg_v, pointer(vt))
-            # self.in_vals.append((arg_v.value, vt))
-
-        # self.ctx = c_void_p(0)
-        # self.h.odla_CreateContext(pointer(self.ctx))
 
         self.out_vals = []
         for idx in range(0, self.nr_outputs):
             out = c_void_p(0)
             self.h.odla_GetOutputFromComputationByIdx(self.comp, idx, pointer(out))
-            # vt = ValueType()
-            # self.h.odla_GetValueType(out, pointer(vt))
-            # n = 1
-            # for r in range(0, vt.shape.size):
-            #     n *= vt.shape.dims[r]
-            # self.out_vals.append((out, vt, n))
-            # buf = (c_float * n)() # FIXME: handle types
-            # self.h.odla_BindToOutput(out, buf, self.ctx)
-            # self.buffers.append(buf)
 
     def Execute(self, data, model, batch):
         print(f"model:{model},batch:{batch}")
-        # for idx, v in enumerate(self.in_vals):
-        #     self.h.odla_BindToArgument(
-        #         v[0], data[idx].ctypes.data_as(c_void_p), self.ctx
-        #     )
-        self.h.odla_BindToArgument(c_void_p(0), data[0].ctypes.data_as(c_void_p), self.ctx)
+        # bind input
+        if("bert" in model):
+            self.h.odla_BindToArgument(c_void_p(0), data[0].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(1), data[1].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(2), data[2].ctypes.data_as(c_void_p), self.ctx)
+        elif("shoucai" in model):
+            self.h.odla_BindToArgument(c_void_p(0), data[0].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(1), data[1].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(2), data[2].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(3), data[3].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(4), data[4].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(5), data[5].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(6), data[6].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(7), data[7].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(8), data[8].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(9), data[9].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(10), data[10].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(11), data[11].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(12), data[12].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(13), data[13].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(14), data[14].ctypes.data_as(c_void_p), self.ctx)
+            self.h.odla_BindToArgument(c_void_p(15), data[15].ctypes.data_as(c_void_p), self.ctx)
+        else:
+            self.h.odla_BindToArgument(c_void_p(0), data[0].ctypes.data_as(c_void_p), self.ctx)
+
         # output buffer
         buffers = []
         if("bert" in model):
-            buf1 = (c_float * 256 * batch)() 
+            buf1 = (c_float * 256 * batch)()
             buffers.append(buf1)
             self.h.odla_BindToOutput(c_void_p(0), buf1, self.ctx)
             buf2 = (c_float * 256 * batch)()
             buffers.append(buf2)
-            self.h.odla_BindToOutput(c_void_p(0), buf2, self.ctx)
+            self.h.odla_BindToOutput(c_void_p(1), buf2, self.ctx)
         else:
             if("resnet50" in model):
                 buf = (c_float * 1*1000 * batch)()
             elif("dbnet" in model):
+                assert((batch==1) and "dbnet only support 1 batch.")
                 buf = (c_float * 1228800 * batch)()
             elif("crnn" in model):
+                assert((batch==1) and "crnn only support 1 batch.")
                 buf = (c_float * 918146 * batch)()
+            elif("shoucai" in model):
+                #Add info for Shoucai Model
+                assert((batch==1) and "shoucai only support 1 batch!")
+                buf = (c_float * 425 * batch)()
             else:
                 assert(False and f"unknown model.{model}")
             buffers.append(buf)
             self.h.odla_BindToOutput(c_void_p(0), buf, self.ctx)
 
+        # send data
         if("resnet50" in model):
-            self.h.model_data(self.ctx,  (c_int32 * 1)(*[224*224*3*4]), (c_int32 * 1)(*[1000*4]))
+            self.h.model_data(self.ctx,  (c_int32 * 1)(*[224*224*3*4*batch]), (c_int32 * 1)(*[1000*4*batch]))
         elif("dbnet" in model):
-            self.h.model_data(self.ctx,  (c_int32 * 1)(*[1*3*960*1280*4]), (c_int32 * 1)(*[1228800 * 4]))
+            self.h.model_data(self.ctx,  (c_int32 * 1)(*[1*3*960*1280*4*batch]), (c_int32 * 1)(*[1228800 * 4*batch]))
         elif("crnn" in model):
-            self.h.model_data(self.ctx,  (c_int32 * 1)(*[63840*4]), (c_int32 * 1)(*[918146*4]))
+            self.h.model_data(self.ctx,  (c_int32 * 1)(*[63840*4*batch]), (c_int32 * 1)(*[918146*4*batch]))
         elif("bert" in model):
-            self.h.model_data(self.ctx,  (c_int32 * 1)(*[512*4, 256*4, 256*4]), (c_int32 * 1)(*[256*4,256*4]))
+            self.h.model_data(self.ctx,  (c_int32 * 3)(*[512*4*batch, 256*4*batch, 256*8*batch]), (c_int32 * 2)(*[256*4*batch,256*4*batch]))
+        elif ("shoucai" in model):
+            #per Input File
+            #self.h.model_data(self.ctx,  (c_int32 * 16)(*[8*4*batch, 1*4*batch, 592*4*batch, 1*4*batch, 512*4*batch, 512*4*batch, 73728*4*batch, 27200*4*batch, 13600*4*batch, 13600*4*batch, 122400*4*batch, 350200*4*batch, 15552*4*batch, 1161984*4*batch, 178*4*batch, 425*8*batch]), (c_int32*1)(*[425*4*batch]))            
+            
+            #per Input Shape
+            self.h.model_data(self.ctx, (c_int32 * 18)(*[425*4*batch, 8*4*batch, 1*4*batch, 592*4*batch, 1*4*batch, 512*4*batch, 512*4*batch, 73728*4*batch, 425*4*batch, 425*4*batch, 27200*4*batch,13600*4*batch, 13600*4*batch, 122400*4*batch, 350200*4*batch, 15552*4*batch, 1161984*4*batch, 178*4*batch]),  (c_int32*1)(*[425*4*batch]))            
+
+
+            '''
+            self.h.model_data(self._ctx, (c_int32 * 16)(*[8*4*batch,     \ # embedding_ui_oage_shared_embedding.txt
+                                                         1*4*batch,      \ # input_from_feature_columns_concat_3.txt
+                                                         592*4*batch,    \ # input_from_feature_columns_concat.txt
+                                                         1*4*batch,      \ #input_from_feature_columns_concat_5.txt
+                                                         512*4*batch,    \ #all_clk_seq_1_time.txt
+                                                         512*4*batch,    \#all_clk_seq_1_st.txt
+                                                         73728*4*batch,  \ #seq_input_from_feature_columns_concat_1.txt
+                                                         27200*4*batch,  \#embedding_item_id_d_shard_embedding_2.txt
+                                                         13600*4*batch,  \#embedding_item_cate_id_d_shared_embedding_2.txt
+                                                         13600*4*batch,  \#embedding_item_seller_id_d_shared_embedding_2.txt
+                                                         122400*4*batch, \#input_from_feature_columns_concat_4.txt
+                                                         350200*4*batch, \#input_from_feature_columns_concat_1.txt
+                                                         15552*4*batch,  \#seq_input_from_feature_columns_concat.txt
+                                                         1161984*4*batch,\ #seq_input_from_feature_columns_concat_2.txt
+                                                         178*4*batch,    \#input_from_feature_columns_concat_7.txt
+                                                         425*8*batch]),  \#Unique_preprocess_int64.txt
+                                         (c_int32*1)(*[425*4*batch]))   #output
+            
+            
+            self.h.model_data(self._ctx, (c_int32 * 18)(*[425*4*batch, \# LookupPkOP
+                                                          8*4*batch,     \ # embedding_ui_oage_shared_embedding.txt
+                                                          1*4*batch,      \ # input_from_feature_columns_concat_3.txt
+                                                          592*4*batch,    \ # input_from_feature_columns_concat.txt
+                                                          1*4*batch,      \ #input_from_feature_columns_concat_5.txt
+                                                          512*4*batch,    \ #all_clk_seq_1_time.txt
+                                                          512*4*batch,    \#all_clk_seq_1_st.txt
+                                                          73728*4*batch,  \ #seq_input_from_feature_columns_concat_1.txt
+                                                          425*4*batch,    \ #batch_fill_attributes_for_gul_rank_item_feature
+                                                          425*4*batch，   \ #batch_fill_attributes_for_gul_rank_item_feature_1
+                                                          27200*4*batch,  \#embedding_item_id_d_shard_embedding_2.txt
+                                                          13600*4*batch,  \#embedding_item_cate_id_d_shared_embedding_2.txt
+                                                          13600*4*batch,  \#embedding_item_seller_id_d_shared_embedding_2.txt
+                                                          122400*4*batch, \#input_from_feature_columns_concat_4.txt
+                                                          350200*4*batch, \#input_from_feature_columns_concat_1.txt
+                                                         15552*4*batch,  \#seq_input_from_feature_columns_concat.txt
+                                                         1161984*4*batch,\ #seq_input_from_feature_columns_concat_2.txt
+                                                         178*4*batch]),    \#input_from_feature_columns_concat_7.txt
+                                                         
+                                         (c_int32*1)(*[425*4*batch]))   #output
+            '''        
         else:
             assert(False and f"unknown model.{model}")
     
         s = time()
-        # self.h.odla_ExecuteComputation(self.comp, self.ctx, 0, c_void_p(0))
         self.h.odla_ExecuteComputation(self.comp, self.ctx, 0, self.device)
         t = time()
         self.logger.info("Execution time:" + str(t - s) + " sec(s)")

From 349e3b45058646bf0c620191179286232eb5b145 Mon Sep 17 00:00:00 2001
From: Weiming Zhao <weiming.zhao@alibaba-inc.com>
Date: Sat, 27 Nov 2021 16:09:23 -0800
Subject: [PATCH 3/6] remove unneeded import

---
 python/halo/odla.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/halo/odla.py b/python/halo/odla.py
index d6acdc447..c4f5c7b8c 100644
--- a/python/halo/odla.py
+++ b/python/halo/odla.py
@@ -17,8 +17,6 @@
 from time import time
 import logging
 import os
-from pdb import set_trace
-import numpy as np
 
 class Device(Enum):
     CUDA = 1

From d47b46de7b5d6bad1a23b5bbbd343a6193a4a5c0 Mon Sep 17 00:00:00 2001
From: zh-wei <wujiang.wz@alibaba-inc.com>
Date: Thu, 2 Dec 2021 09:50:37 +0000
Subject: [PATCH 4/6] Complete Python api for AnalyzeModel.

---
 python/halo/halo.py      | 34 ++++++++++++++++++++++++++++---
 python/halo/inference.py | 22 +++++++-------------
 python/halo/odla.py      | 43 ++++++++++++++++++++++++++++++----------
 3 files changed, 71 insertions(+), 28 deletions(-)

diff --git a/python/halo/halo.py b/python/halo/halo.py
index 7d55ba37d..a48d58621 100644
--- a/python/halo/halo.py
+++ b/python/halo/halo.py
@@ -62,6 +62,17 @@ class CXXCodeGenOpts(Structure):
         ("save_temps", c_bool),
     ]
 
+HALO_MODEL_INFO_MAX_OUTPUT_NR = 64
+HALO_VODLA_MAX_OUTPUT_RSC_EST = 2048
+class ModelInfo(Structure):
+    _fields_ = [
+        ("num_outputs", c_size_t),
+        ("output_buf_sizes", c_size_t*HALO_MODEL_INFO_MAX_OUTPUT_NR),
+        ("input_qps", c_int),
+        ("adaptive_bsz", c_int),
+        ("output_rsc_est", c_char*HALO_VODLA_MAX_OUTPUT_RSC_EST),
+    ]
+
 
 """
 int halo_Compile(halo::ModelFormat model_format, unsigned num_models,
@@ -91,6 +102,24 @@ class CXXCodeGenOpts(Structure):
     c_void_p,  # model_info
 ]
 
+Analyze.argtypes = [
+    c_int,  # model_format
+    c_uint,  # num_models
+    c_void_p,  # models
+    c_void_p,  # model_sizes
+    c_char_p,  # target
+    c_int,  # batch
+    c_uint,  # num_input_shapes
+    c_void_p,  # input_shapes
+    c_uint,  # num_inputs
+    c_void_p,  # inputs
+    c_uint,  # num_outputs
+    c_void_p,  # outputs
+    c_void_p,  # cg_opts
+    c_char_p,  # filename
+    c_void_p,  # model_info
+]
+
 
 def exec(args):
     proc = subprocess.run(args)
@@ -163,7 +192,7 @@ def CompileModel(model_file, input_shapes, output_names, batch, format):
     return [output_file, output_bin]
 
 
-def AnalyzeModel(model_file, input_shapes, batch, format):
+def AnalyzeModel(model_file, input_shapes, batch, format, model_info):
     output_file = ""
     odla_lib = cast(create_string_buffer(b""), c_char_p)
     opts = CXXCodeGenOpts()
@@ -215,10 +244,9 @@ def AnalyzeModel(model_file, input_shapes, batch, format):
         outputs,
         pointer(opts),
         output_filename,
-        0
+        pointer(model_info),
     )
 
-
 def CompileODLAModel(files, device, debug=False):
     cc_file = files[0]
     bin_file = files[1]
diff --git a/python/halo/inference.py b/python/halo/inference.py
index 508a95197..022457c43 100644
--- a/python/halo/inference.py
+++ b/python/halo/inference.py
@@ -32,6 +32,7 @@ def __init__(
         device,
         batch,
         format,
+        qps,
         debug,
         log_level,
     ):
@@ -54,33 +55,24 @@ def __init__(
         self.format = format
         self.device = device
         self.batch = batch
+        self.qps = qps
         self.model = None
         self.so_file = None
-        self.intermediate_files = []
-        self.save_temps = False
-        print(f"self.batch:{self.batch}")
 
     def __del__(self):
-        self.logger.info(str(self.intermediate_files))
-        for file in self.intermediate_files:
-            if not self.save_temps:
-                Path(file).unlink()
         del self.model
 
     def Initialize(self):
         self.logger.info(f"Begin initialization;{self.model_file}")
-        files = halo.CompileModel(
+        self.so_file = "/usr/local/lib/libvodla.so"
+        self.model = odla.ODLAModel(self.so_file)
+        self.model.Load(
             self.model_file,
             self.input_shapes,
             self.output_names,
-            self.batch,
             self.format,
-        )
-        # self.so_file = halo.CompileODLAModel(files, self.device, self.debug)
-        self.so_file = "/usr/local/lib/libvodla.so"
-        self.intermediate_files = [*files]
-        self.model = odla.ODLAModel(self.so_file,files)
-        self.model.Load(self.model_file)
+            self.batch,
+            self.qps)
         self.logger.info("Done initialization")
 
     def Run(self, data):
diff --git a/python/halo/odla.py b/python/halo/odla.py
index c4f5c7b8c..5ffb51633 100644
--- a/python/halo/odla.py
+++ b/python/halo/odla.py
@@ -17,6 +17,8 @@
 from time import time
 import logging
 import os
+from pathlib import Path
+from halo import halo
 
 class Device(Enum):
     CUDA = 1
@@ -33,23 +35,44 @@ class ValueType(Structure):
 
 
 class ODLAModel:
-    def __init__(self, so_file, files):
+    def __init__(self, so_file):
         self.logger = logging.getLogger(__name__)
         self.so_file = so_file
         self.h = None
-        self.buffers = []
-        self.files = files
-
-    def Load(self,model):
+        self.save_temps = False
+        self.intermediate_files = []
+
+    def __del__(self):
+        self.logger.info(str(self.intermediate_files))
+        for file in self.intermediate_files:
+            if not self.save_temps:
+                Path(file).unlink()
+        if self.h is not None:
+            self.h.odla_DestroyContext(self.ctx)
+            self.h.odla_DestroyComputation(self.comp)
+            self.h.odla_DestroyDevice(self.device)
+
+    def Load(self,model,input_shapes,output_names,format,batch,qps):
         if self.h is None:
             self.h = CDLL(self.so_file)
         self.comp = c_void_p(0)
         self.device = c_void_p(0)
-        self.h.odla_AllocateDevice(c_void_p(0), 0, pointer(self.device))
-        self.h.odla_CreateComputation(pointer(self.comp))
-        # TODO:
-        # use_sim = c_bool(True)
-        # self.h.odla_SetComputationItem(self.comp, 7, pointer(use_sim))
+        model_info = halo.ModelInfo()
+        model_info.input_qps = qps
+        model_info.adaptive_bsz = batch
+        rsc_est = c_void_p(0)
+        if qps>0 and batch==1:
+            halo.AnalyzeModel(model,[],1,format,model_info)
+            rsc_est = (c_char_p)(model_info.output_rsc_est)
+        self.h.odla_AllocateDevice(c_void_p(0), 0, pointer(self.device), rsc_est)
+        self.files = halo.CompileModel(
+            model,
+            input_shapes,
+            output_names,
+            model_info.adaptive_bsz,
+            format,    
+        )
+        self.intermediate_files = [*self.files]
         cc_file = str(self.files[0]).encode("utf-8")
         bin_file = str(self.files[1]).encode("utf-8")
 

From ed3a1c12de5c1fdf90d6ccd98283060b1a1e53aa Mon Sep 17 00:00:00 2001
From: zh-wei <wujiang.wz@alibaba-inc.com>
Date: Mon, 6 Dec 2021 09:42:56 +0000
Subject: [PATCH 5/6] [Analyzer][Bugfix] halo.CompileModel() should use the
 user's batch size instead of adaptive_bsz.

---
 python/halo/odla.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/halo/odla.py b/python/halo/odla.py
index 5ffb51633..e0e4b2aba 100644
--- a/python/halo/odla.py
+++ b/python/halo/odla.py
@@ -69,7 +69,7 @@ def Load(self,model,input_shapes,output_names,format,batch,qps):
             model,
             input_shapes,
             output_names,
-            model_info.adaptive_bsz,
+            batch,
             format,    
         )
         self.intermediate_files = [*self.files]

From 909c7f02e1d02575d7cf31c631a717ac8b5f9ea2 Mon Sep 17 00:00:00 2001
From: zh-wei <wujiang.wz@alibaba-inc.com>
Date: Fri, 24 Dec 2021 02:35:35 +0000
Subject: [PATCH 6/6] [AnalyzeModel] Analyzer should make use of 'input_shapes'
 for computation estimation.

---
 python/halo/odla.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/halo/odla.py b/python/halo/odla.py
index e0e4b2aba..394c85d02 100644
--- a/python/halo/odla.py
+++ b/python/halo/odla.py
@@ -62,7 +62,7 @@ def Load(self,model,input_shapes,output_names,format,batch,qps):
         model_info.adaptive_bsz = batch
         rsc_est = c_void_p(0)
         if qps>0 and batch==1:
-            halo.AnalyzeModel(model,[],1,format,model_info)
+            halo.AnalyzeModel(model,input_shapes,1,format,model_info)
             rsc_est = (c_char_p)(model_info.output_rsc_est)
         self.h.odla_AllocateDevice(c_void_p(0), 0, pointer(self.device), rsc_est)
         self.files = halo.CompileModel(