From 4c3da0df24f7b22b4891a934299a52f431fcce6b Mon Sep 17 00:00:00 2001
From: 黄宇扬
Date: Wed, 17 Jul 2024 18:14:05 +0800
Subject: [PATCH] Add GPU memory controls to the Python API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tools/fastllm_pytools/llm.py    | 38 +++++++++++++++++++++++++++++++--
 tools/fastllm_pytools/server.py |  1 +
 tools/fastllm_pytools/util.py   |  6 ++++++
 tools/src/pytools.cpp           | 20 +++++++++++++++++
 4 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/tools/fastllm_pytools/llm.py b/tools/fastllm_pytools/llm.py
index a2267dbd..22dcac7b 100644
--- a/tools/fastllm_pytools/llm.py
+++ b/tools/fastllm_pytools/llm.py
@@ -76,6 +76,15 @@
 fastllm_lib.apply_chat_template.argtypes = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
 fastllm_lib.apply_chat_template.restype = ctypes.c_char_p
 
+fastllm_lib.set_kv_cache_limit_llm_model.argtypes = [ctypes.c_int, ctypes.c_int64]
+
+fastllm_lib.set_max_batch_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
+
+fastllm_lib.set_verbose_llm_model.argtypes = [ctypes.c_int, ctypes.c_bool]
+
+fastllm_lib.get_max_input_len_llm_model.argtypes = [ctypes.c_int]
+fastllm_lib.get_max_input_len_llm_model.restype = ctypes.c_int
+
 def set_cpu_threads(threads: int):
     fastllm_lib.set_cpu_threads(threads);
 
@@ -841,7 +850,32 @@ def release_memory(self):
         fastllm_lib.release_memory(self.model)
 
     def set_save_history(self, save: bool):
-        fastllm_lib.set_save_history(self.model, save);
+        fastllm_lib.set_save_history(self.model, save)
 
     def set_atype(self, atype: str):
-        fastllm_lib.set_model_atype(self.model, str(atype).encode());
\ No newline at end of file
+        fastllm_lib.set_model_atype(self.model, str(atype).encode())
+
+    def set_kv_cache_limit(self, limit: str):
+        limit_bytes = 0
+        try:
+            if (limit.endswith('k') or limit.endswith('K')):
+                limit_bytes = int(limit[:-1]) * 1024
+            elif (limit.endswith('m') or limit.endswith('M')):
+                limit_bytes = int(limit[:-1]) * 1024 * 1024
+            elif (limit.endswith('g') or limit.endswith('G')):
+                limit_bytes = int(limit[:-1]) * 1024 * 1024 * 1024
+            else:
+                limit_bytes = int(limit)
+        except:
+            print('set_kv_cache_limit error, param should be like "10k" or "10m" or "1g"')
+            exit(0)
+        fastllm_lib.set_kv_cache_limit_llm_model(self.model, ctypes.c_int64(limit_bytes))
+
+    def set_max_batch(self, batch: int):
+        fastllm_lib.set_max_batch_llm_model(self.model, batch)
+
+    def set_verbose(self, verbose: int):
+        fastllm_lib.set_verbose_llm_model(self.model, verbose)
+
+    def get_max_input_len(self):
+        return fastllm_lib.get_max_input_len_llm_model(self.model)
\ No newline at end of file
diff --git a/tools/fastllm_pytools/server.py b/tools/fastllm_pytools/server.py
index a644d9c3..82b9225f 100644
--- a/tools/fastllm_pytools/server.py
+++ b/tools/fastllm_pytools/server.py
@@ -62,6 +62,7 @@ def init_logging(log_level = logging.INFO, log_file:str = None):
     args = parse_args()
     logging.info(args)
     model = make_normal_llm_model(args)
+    model.set_verbose(True)
     fastllm_completion = FastLLmCompletion(model_name = args.model_name,
                                            model = model)
     uvicorn.run(app, host = args.host, port = args.port)
diff --git a/tools/fastllm_pytools/util.py b/tools/fastllm_pytools/util.py
index 6c5ab78a..081306f5 100644
--- a/tools/fastllm_pytools/util.py
+++ b/tools/fastllm_pytools/util.py
@@ -9,6 +9,8 @@ def make_normal_parser(des: str) -> argparse.ArgumentParser:
     parser.add_argument('--dtype', type = str, default = "float16", help = '权重类型(读取HF模型时有效)')
     parser.add_argument('--atype', type = str, default = "float32", help = '推理类型,可使用float32或float16')
     parser.add_argument('--cuda_embedding', action = 'store_true', help = '在cuda上进行embedding')
+    parser.add_argument('--kv_cache_limit', type = str, default = "auto", help = 'kv缓存最大使用量')
+    parser.add_argument('--max_batch', type = int, default = -1, help = '每次最多同时推理的询问数量')
     parser.add_argument('--device', type = str, help = '使用的设备')
     return parser
 
@@ -29,4 +31,8 @@ def make_normal_llm_model(args):
         llm.set_cuda_embedding(True)
     model = llm.model(args.path, dtype = args.dtype, tokenizer_type = "auto")
     model.set_atype(args.atype)
+    if (args.max_batch > 0):
+        model.set_max_batch(args.max_batch)
+    if (args.kv_cache_limit != "" and args.kv_cache_limit != "auto"):
+        model.set_kv_cache_limit(args.kv_cache_limit)
     return model
\ No newline at end of file
diff --git a/tools/src/pytools.cpp b/tools/src/pytools.cpp
index ed5486c0..53cf3461 100644
--- a/tools/src/pytools.cpp
+++ b/tools/src/pytools.cpp
@@ -391,4 +391,24 @@ extern "C" {
         auto model = models.GetModel(modelId);
         model->AddPromptCache(input);
     }
+
+    DLL_EXPORT void set_kv_cache_limit_llm_model(int modelId, long long bytes) {
+        auto model = models.GetModel(modelId);
+        model->kvCacheLimit = bytes;
+    }
+
+    DLL_EXPORT void set_max_batch_llm_model(int modelId, int batch) {
+        auto model = models.GetModel(modelId);
+        model->maxBatch = batch;
+    }
+
+    DLL_EXPORT void set_verbose_llm_model(int modelId, bool verbose) {
+        auto model = models.GetModel(modelId);
+        model->verbose = verbose;
+    }
+
+    DLL_EXPORT int get_max_input_len_llm_model(int modelId) {
+        auto model = models.GetModel(modelId);
+        return model->max_positions;
+    }
 };
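
Usage sketch: a minimal example of driving the new model-level controls from Python, assuming the tools are importable as fastllm_pytools.llm; the model file name and the "4g"/16 values are illustrative placeholders, and the loader call mirrors the one used in util.py above.

    from fastllm_pytools import llm  # assumed import path for the fastllm Python tools

    # Load a converted model, mirroring util.py's llm.model(...) call (path is a placeholder).
    model = llm.model("model.flm", dtype = "float16", tokenizer_type = "auto")

    model.set_kv_cache_limit("4g")    # cap KV cache memory; "k"/"m"/"g" suffixes are parsed as KiB/MiB/GiB
    model.set_max_batch(16)           # batch at most 16 requests per inference step
    model.set_verbose(True)           # verbose runtime output, as server.py now enables by default
    print(model.get_max_input_len())  # maximum input length reported by the C++ side (max_positions)

When the server is started through make_normal_parser/make_normal_llm_model, the same limits are reachable via the new --kv_cache_limit (e.g. "4g", default "auto") and --max_batch command-line options.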