From 4c3da0df24f7b22b4891a934299a52f431fcce6b Mon Sep 17 00:00:00 2001
From: 黄宇扬
Date: Wed, 17 Jul 2024 18:14:05 +0800
Subject: [PATCH] Add GPU memory controls to the Python API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tools/fastllm_pytools/llm.py    | 38 +++++++++++++++++++++++++++++++--
 tools/fastllm_pytools/server.py |  1 +
 tools/fastllm_pytools/util.py   |  6 ++++++
 tools/src/pytools.cpp           | 20 +++++++++++++++++
 4 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/tools/fastllm_pytools/llm.py b/tools/fastllm_pytools/llm.py
index a2267dbd..22dcac7b 100644
--- a/tools/fastllm_pytools/llm.py
+++ b/tools/fastllm_pytools/llm.py
@@ -76,6 +76,15 @@
 fastllm_lib.apply_chat_template.argtypes = [ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
 fastllm_lib.apply_chat_template.restype = ctypes.c_char_p
 
+fastllm_lib.set_kv_cache_limit_llm_model.argtypes = [ctypes.c_int, ctypes.c_int64]
+
+fastllm_lib.set_max_batch_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
+
+fastllm_lib.set_verbose_llm_model.argtypes = [ctypes.c_int, ctypes.c_bool]
+
+fastllm_lib.get_max_input_len_llm_model.argtypes = [ctypes.c_int]
+fastllm_lib.get_max_input_len_llm_model.restype = ctypes.c_int
+
 def set_cpu_threads(threads: int):
     fastllm_lib.set_cpu_threads(threads);
 
@@ -841,7 +850,32 @@ def release_memory(self):
         fastllm_lib.release_memory(self.model)
 
     def set_save_history(self, save: bool):
-        fastllm_lib.set_save_history(self.model, save);
+        fastllm_lib.set_save_history(self.model, save)
 
     def set_atype(self, atype: str):
-        fastllm_lib.set_model_atype(self.model, str(atype).encode());
\ No newline at end of file
+        fastllm_lib.set_model_atype(self.model, str(atype).encode())
+
+    def set_kv_cache_limit(self, limit: str):
+        limit_bytes = 0
+        try:
+            if (limit.endswith('k') or limit.endswith('K')):
+                limit_bytes = int(limit[:-1]) * 1024
+            elif (limit.endswith('m') or limit.endswith('M')):
+                limit_bytes = int(limit[:-1]) * 1024 * 1024
+            elif (limit.endswith('g') or limit.endswith('G')):
+                limit_bytes = int(limit[:-1]) * 1024 * 1024 * 1024
+            else:
+                limit_bytes = int(limit)
+        except:
+            print('set_kv_cache_limit error, param should be like "10k" or "10m" or "1g"')
+            exit(0)
+        fastllm_lib.set_kv_cache_limit_llm_model(self.model, ctypes.c_int64(limit_bytes))
+
+    def set_max_batch(self, batch: int):
+        fastllm_lib.set_max_batch_llm_model(self.model, batch)
+
+    def set_verbose(self, verbose: int):
+        fastllm_lib.set_verbose_llm_model(self.model, verbose)
+
+    def get_max_input_len(self):
+        return fastllm_lib.get_max_input_len_llm_model(self.model)
\ No newline at end of file
diff --git a/tools/fastllm_pytools/server.py b/tools/fastllm_pytools/server.py
index a644d9c3..82b9225f 100644
--- a/tools/fastllm_pytools/server.py
+++ b/tools/fastllm_pytools/server.py
@@ -62,6 +62,7 @@ def init_logging(log_level = logging.INFO, log_file:str = None):
     args = parse_args()
     logging.info(args)
     model = make_normal_llm_model(args)
+    model.set_verbose(True)
     fastllm_completion = FastLLmCompletion(model_name = args.model_name,
                                            model = model)
     uvicorn.run(app, host = args.host, port = args.port)
diff --git a/tools/fastllm_pytools/util.py b/tools/fastllm_pytools/util.py
index 6c5ab78a..081306f5 100644
--- a/tools/fastllm_pytools/util.py
+++ b/tools/fastllm_pytools/util.py
@@ -9,6 +9,8 @@ def make_normal_parser(des: str) -> argparse.ArgumentParser:
     parser.add_argument('--dtype', type = str, default = "float16", help = '权重类型(读取HF模型时有效)')
     parser.add_argument('--atype', type = str, default = "float32", help = '推理类型,可使用float32或float16')
     parser.add_argument('--cuda_embedding', action = 'store_true', help = '在cuda上进行embedding')
+    parser.add_argument('--kv_cache_limit', type = str, default = "auto", help = 'kv缓存最大使用量')
+    parser.add_argument('--max_batch', type = int, default = -1, help = '每次最多同时推理的询问数量')
     parser.add_argument('--device', type = str, help = '使用的设备')
     return parser
 
@@ -29,4 +31,8 @@ def make_normal_llm_model(args):
         llm.set_cuda_embedding(True)
     model = llm.model(args.path, dtype = args.dtype, tokenizer_type = "auto")
     model.set_atype(args.atype)
+    if (args.max_batch > 0):
+        model.set_max_batch(args.max_batch)
+    if (args.kv_cache_limit != "" and args.kv_cache_limit != "auto"):
+        model.set_kv_cache_limit(args.kv_cache_limit)
     return model
\ No newline at end of file
diff --git a/tools/src/pytools.cpp b/tools/src/pytools.cpp
index ed5486c0..53cf3461 100644
--- a/tools/src/pytools.cpp
+++ b/tools/src/pytools.cpp
@@ -391,4 +391,24 @@ extern "C" {
         auto model = models.GetModel(modelId);
         model->AddPromptCache(input);
     }
+
+    DLL_EXPORT void set_kv_cache_limit_llm_model(int modelId, long long bytes) {
+        auto model = models.GetModel(modelId);
+        model->kvCacheLimit = bytes;
+    }
+
+    DLL_EXPORT void set_max_batch_llm_model(int modelId, int batch) {
+        auto model = models.GetModel(modelId);
+        model->maxBatch = batch;
+    }
+
+    DLL_EXPORT void set_verbose_llm_model(int modelId, bool verbose) {
+        auto model = models.GetModel(modelId);
+        model->verbose = verbose;
+    }
+
+    DLL_EXPORT int get_max_input_len_llm_model(int modelId) {
+        auto model = models.GetModel(modelId);
+        return model->max_positions;
+    }
 };
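
Usage sketch: a minimal example of driving the new model-level controls from Python, assuming the tools are importable as fastllm_pytools.llm; the model file name and the "4g"/16 values are illustrative placeholders, and the loader call mirrors the one used in util.py above.

    from fastllm_pytools import llm  # assumed import path for the fastllm Python tools

    # Load a converted model, mirroring util.py's llm.model(...) call (path is a placeholder).
    model = llm.model("model.flm", dtype = "float16", tokenizer_type = "auto")

    model.set_kv_cache_limit("4g")    # cap KV cache memory; "k"/"m"/"g" suffixes are parsed as KiB/MiB/GiB
    model.set_max_batch(16)           # batch at most 16 requests per inference step
    model.set_verbose(True)           # verbose runtime output, as server.py now enables by default
    print(model.get_max_input_len())  # maximum input length reported by the C++ side (max_positions)

When the server is started through make_normal_parser/make_normal_llm_model, the same limits are reachable via the new --kv_cache_limit (e.g. "4g", default "auto") and --max_batch command-line options.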