From de88d050384c5b5abb267090e8f5a535aba44764 Mon Sep 17 00:00:00 2001
From: cgli
Date: Wed, 3 Jul 2024 18:37:04 +0800
Subject: [PATCH 1/3] Fix the Windows build and direct HF model loading issues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 example/Win32Demo/fastllm-gpu.vcxproj         |  4 ++++
 example/Win32Demo/fastllm-gpu.vcxproj.filters | 12 ++++++++++++
 example/Win32Demo/fastllm.vcxproj             |  4 ++++
 example/Win32Demo/fastllm.vcxproj.filters     | 12 ++++++++++++
 src/model.cpp                                 |  4 ++--
 5 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/example/Win32Demo/fastllm-gpu.vcxproj b/example/Win32Demo/fastllm-gpu.vcxproj
index ee8c0c0b..46ebf855 100644
--- a/example/Win32Demo/fastllm-gpu.vcxproj
+++ b/example/Win32Demo/fastllm-gpu.vcxproj
@@ -201,6 +201,7 @@
+
@@ -208,6 +209,7 @@
+
@@ -227,12 +229,14 @@
+
+
diff --git a/example/Win32Demo/fastllm-gpu.vcxproj.filters b/example/Win32Demo/fastllm-gpu.vcxproj.filters
index f8326fbf..8b8a5a10 100644
--- a/example/Win32Demo/fastllm-gpu.vcxproj.filters
+++ b/example/Win32Demo/fastllm-gpu.vcxproj.filters
@@ -57,6 +57,9 @@
       头文件
+
+      头文件
+
       头文件
@@ -81,6 +84,9 @@
       头文件\models
+
+      头文件\models
+
       头文件\models
@@ -134,6 +140,9 @@
       源文件
+
+      源文件
+
       源文件
@@ -155,6 +164,9 @@
       源文件\models
+
+      源文件\models
+
       源文件\models
diff --git a/example/Win32Demo/fastllm.vcxproj b/example/Win32Demo/fastllm.vcxproj
index d684c814..a1bd9596 100644
--- a/example/Win32Demo/fastllm.vcxproj
+++ b/example/Win32Demo/fastllm.vcxproj
@@ -177,6 +177,7 @@
+
@@ -184,6 +185,7 @@
+
@@ -201,12 +203,14 @@
+
+
diff --git a/example/Win32Demo/fastllm.vcxproj.filters b/example/Win32Demo/fastllm.vcxproj.filters
index afe15976..94ea3a95 100644
--- a/example/Win32Demo/fastllm.vcxproj.filters
+++ b/example/Win32Demo/fastllm.vcxproj.filters
@@ -57,6 +57,9 @@
       头文件
+
+      头文件
+
       头文件
@@ -81,6 +84,9 @@
       头文件\models
+
+      头文件\models
+
       头文件\models
@@ -128,6 +134,9 @@
       源文件
+
+      源文件
+
       源文件
@@ -149,6 +158,9 @@
       源文件\models
+
+      源文件\models
+
       源文件\models
diff --git a/src/model.cpp b/src/model.cpp
index f15f711f..ad68e4d9 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -262,9 +262,9 @@ namespace fastllm {
             ClearBuffer();
             buffer = new uint8_t[len * unitSize];
-            FILE *fi = fopen(this->fileName.c_str(), "r");
+            FILE *fi = fopen(this->fileName.c_str(), "rb");
             int ret;
-#if defined(_WIN32) or defined(_WIN64)
+#if defined(_WIN32) || defined(_WIN64)
             _fseeki64(fi, this->data_offsets[0], 0);
 #else
             fseek(fi, this->data_offsets[0], 0);

From 2793c4054d2e4a303f28da8ec4f79d7ef084e9e3 Mon Sep 17 00:00:00 2001
From: cgli
Date: Wed, 3 Jul 2024 20:44:05 +0800
Subject: [PATCH 2/3] Support saving llama-family models loaded directly from
 safetensors in the flm format, and loading them for inference
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 include/devices/cpu/alivethreadpool.h | 2 +-
 src/fastllm.cpp                       | 2 ++
 src/model.cpp                         | 7 ++++++-
 src/models/llama.cpp                  | 6 +++++-
 4 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/include/devices/cpu/alivethreadpool.h b/include/devices/cpu/alivethreadpool.h
index f64132fc..2c963961 100644
--- a/include/devices/cpu/alivethreadpool.h
+++ b/include/devices/cpu/alivethreadpool.h
@@ -62,7 +62,7 @@ namespace fastllm {
             auto duration = std::chrono::duration_cast<std::chrono::microseconds> (std::chrono::system_clock::now() - lastRunTime);
             double gap = double(duration.count()) * std::chrono::microseconds::period::num / std::chrono::microseconds::period::den;
             if (gap > 3) {
-                std::this_thread::sleep_for(std::chrono::seconds(0));
+                std::this_thread::sleep_for(std::chrono::microseconds(2));
             }
         }
     }
diff --git a/src/fastllm.cpp b/src/fastllm.cpp
index 5b2c6a27..eab1d29e 100644
--- a/src/fastllm.cpp
+++ b/src/fastllm.cpp
@@ -1976,6 +1976,8 @@ namespace fastllm {
             }
             tokenizer.SetSpecialTokens(specialTokens);
         }
+        if (this->dicts.find("chat_template") != this->dicts.end())
+            tokenizer.chatTemplate = this->dicts["chat_template"];
         int len = buffer.ReadInt();
         for (int i = 0; i < len; i++) {
diff --git a/src/model.cpp b/src/model.cpp
index ad68e4d9..8fbc5ef9 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -424,6 +424,8 @@ namespace fastllm {
         std::string tokenizerConfigFile = path + "tokenizer_config.json";
         auto tokenizerConfig = json11::Json::parse(ReadAllFile(tokenizerConfigFile), error);
         model->weight.tokenizer.SetTokenizerConfig(tokenizerConfig);
+        if (!model->weight.tokenizer.chatTemplate.empty() && model->weight.dicts.find("chat_template") == model->weight.dicts.end())
+            model->weight.AddDict("chat_template", model->weight.tokenizer.chatTemplate);
         std::string tokenizerClass = tokenizerConfig["tokenizer_class"].string_value();
         if (tokenizerClass == "PreTrainedTokenizerFast"
             || tokenizerClass == "Qwen2Tokenizer"
@@ -439,10 +441,13 @@ namespace fastllm {
                 spTokens[it["content"].string_value()] = it["id"].int_value();
             }
             model->weight.tokenizer.SetSpecialTokens(spTokens);
+            if (!spTokens.empty())
+                model->weight.AddDict("tokenizer_has_special_tokens", "1");
             if (!tokenizer["decoder"].is_null() && !tokenizer["decoder"]["type"].is_null()
                 && tokenizer["decoder"]["type"].string_value() == "ByteLevel") {
                 model->weight.tokenizer.byteAsChar = true;
+                model->weight.AddDict("tokenizer_byte_as_char", "True");
             }
         } else if (tokenizerClass == "ChatGLM4Tokenizer") {
             // GLM4御用的分词
@@ -515,7 +520,7 @@ namespace fastllm {
         auto config = json11::Json::parse(ReadAllFile(configFile), error);
         basellm *model = CreateModelWithType(config["model_type"].string_value());
         for (auto &it : config.object_items()) {
-            model->weight.AddDict(it.first, it.second.dump().c_str());
+            model->weight.AddDict(it.first, it.second.is_string() ? it.second.string_value() : it.second.dump());
         }
         // 设置eos_token_id
         if (config["eos_token_id"].is_array()) {
diff --git a/src/models/llama.cpp b/src/models/llama.cpp
index 6e2724ce..18c7cf26 100644
--- a/src/models/llama.cpp
+++ b/src/models/llama.cpp
@@ -151,7 +151,7 @@ namespace fastllm {
             std::string mergeQkvWeightName = "model.layers." + std::to_string(i) + ".self_attn.mergeqkv.weight";
             std::string mergeQkvBiasName = "model.layers." + std::to_string(i) + ".self_attn.mergeqkv.bias";
-            if (weight.weight.find(qkvWeightName) != weight.weight.end()) {
+            if (weight.weight.find(qkvWeightName) != weight.weight.end() || weight.weight.find(mergeQkvWeightName) != weight.weight.end()) {
                 mergeQKV = true;
                 break;
             } else {
@@ -214,6 +214,10 @@
             std::string w3WeightName = "model.layers." + std::to_string(i) + ".mlp.up_proj.weight";
             std::string swigluWeightName = "model.layers." + std::to_string(i) + ".mlp.gateup_proj.weight";
+            if (weight.weight.find(swigluWeightName) != weight.weight.end()) {
+                mergeQKV = true;
+                break;
+            }
             Data &w1 = weight.weight[w1WeightName], &w3 = weight.weight[w3WeightName];
             if ((w1.dataType == DataType::INT4_GROUP && w1.dims[1] % w1.groupCnt != 0) ||
                 (w3.dataType == DataType::INT4_GROUP && w3.dims[1] % w3.groupCnt != 0)) {

From e787aa64b7ff7824b5db8e30e760d1f83da259c1 Mon Sep 17 00:00:00 2001
From: cgli
Date: Sun, 7 Jul 2024 15:31:43 +0800
Subject: [PATCH 3/3] Save directly loaded glm4-family models in the flm
 format (#465)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/model.cpp                      | 31 ++++++++++++++++++++++++------
 src/models/chatglm.cpp             |  8 ++++++++
 tools/fastllm_pytools/hf_model.py  |  4 ++--
 tools/fastllm_pytools/torch2flm.py |  4 ++--
 4 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/src/model.cpp b/src/model.cpp
index 8fbc5ef9..2f1e55a6 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -66,11 +66,21 @@ namespace fastllm {
     void basellm::InitParams() {
         if (this->weight.dicts.find("bos_token_id") != this->weight.dicts.end()) {
-            if(this->weight.dicts["bos_token_id"]!="None"){
+            if (this->weight.dicts["bos_token_id"]!="None") {
                 this->bos_token_id = atoi(this->weight.dicts["bos_token_id"].c_str());
             }
-            if(this->weight.dicts["eos_token_id"]!="None"){
-                this->eos_token_id = atoi(this->weight.dicts["eos_token_id"].c_str());
+        }
+        if (this->weight.dicts.find("eos_token_id") != this->weight.dicts.end()) {
+            if (this->weight.dicts["eos_token_id"]!="None") {
+                if (this->weight.dicts["eos_token_id"][0] == '[' && this->eos_token_ids.empty()) {
+                    std::string error;
+                    json11::Json ids = json11::Json::parse(this->weight.dicts["eos_token_id"], error);
+                    for (auto &it : ids.array_items()) {
+                        this->eos_token_ids.insert(it.int_value());
+                    }
+                } else {
+                    this->eos_token_id = atoi(this->weight.dicts["eos_token_id"].c_str());
+                }
             }
         }
         if (this->weight.dicts.find("im_start_id") != this->weight.dicts.end()) {
@@ -127,6 +137,16 @@
     }
 
     void basellm::SaveModel(const std::string &fileName) {
+        if (this->weight.tokenizer.chatTemplate.empty()) {
+            if (this->weight.dicts.find("pre_prompt") == this->weight.dicts.end())
+                this->weight.dicts["pre_prompt"] = pre_prompt;
+            if (this->weight.dicts.find("user_role") == this->weight.dicts.end())
+                this->weight.dicts["user_role"] = user_role;
+            if (this->weight.dicts.find("bot_role") == this->weight.dicts.end())
+                this->weight.dicts["bot_role"] = bot_role;
+            if (this->weight.dicts.find("history_sep") == this->weight.dicts.end())
+                this->weight.dicts["history_sep"] = history_sep;
+        }
         this->weight.SaveLowBitModel(fileName, 0);
     }
@@ -451,7 +471,6 @@ namespace fastllm {
             }
         } else if (tokenizerClass == "ChatGLM4Tokenizer") {
             // GLM4御用的分词
-            model->bot_role = " ";
             std::vector <std::string> lines, line;
             SplitString(ReadAllFile(path + "tokenizer.model"), {'\r', '\n'}, lines);
             for (int i = 0; i < lines.size(); i++) {
@@ -463,8 +482,8 @@
                 spTokens[it.second["content"].string_value()] = atoi(it.first.c_str());
             }
             model->weight.tokenizer.SetSpecialTokens(spTokens);
-            ((ChatGLMModel*)model)->gmask_token_id = model->weight.tokenizer.GetTokenId("[gMASK]");
-            ((ChatGLMModel*)model)->bos_token_id = model->weight.tokenizer.GetTokenId("<sop>");
+            model->weight.AddDict("tokenizer_has_special_tokens", "1");
+            model->weight.AddDict("tokenizer_class", tokenizerClass);
             ((ChatGLMModel*)model)->tokenizerClass = tokenizerClass;
 
             // ChatGLM采用拼接token的方法,需要强行指定分割词的TokenID
diff --git a/src/models/chatglm.cpp b/src/models/chatglm.cpp
index 7f786b3e..a6e0e8b1 100644
--- a/src/models/chatglm.cpp
+++ b/src/models/chatglm.cpp
@@ -80,6 +80,9 @@ namespace fastllm {
     void ChatGLMModel::InitParams() {
         basellm::InitParams();
+        if (this->weight.dicts.find("tokenizer_class") != this->weight.dicts.end()) {
+            this->tokenizerClass = this->weight.dicts["tokenizer_class"];
+        }
         if (GetVersion() == 1) {
             if (this->weight.dicts.find("gmask_token_id") != this->weight.dicts.end()) {
                 this->gmask_token_id = atoi(this->weight.dicts["gmask_token_id"].c_str());
@@ -97,6 +100,11 @@
         if (this->weight.dicts.find("rope_ratio") != this->weight.dicts.end()) {
             UpdateRotaryPosEmb(atof(this->weight.dicts["rope_ratio"].c_str()));
         }
+        if (this->tokenizerClass == "ChatGLM4Tokenizer") {
+            this->gmask_token_id = this->weight.tokenizer.GetTokenId("[gMASK]");
+            this->bos_token_id = this->weight.tokenizer.GetTokenId("<sop>");
+            this->weight.tokenizer.type = Tokenizer::TokenizerType::QWEN;
+        }
     }
 
     int ChatGLMModel::Forward(const fastllm::Data &inputIds, const fastllm::Data &attentionMask,
diff --git a/tools/fastllm_pytools/hf_model.py b/tools/fastllm_pytools/hf_model.py
index 3fc2b317..37bbcfb8 100644
--- a/tools/fastllm_pytools/hf_model.py
+++ b/tools/fastllm_pytools/hf_model.py
@@ -94,11 +94,11 @@ def create(model,
         modelInfo["history_sep"] = "";
     if (modelInfo["model_type"] == "chatglm" and hasattr(tokenizer, "name") and tokenizer.name == "GLM4Tokenizer"):
         # glm-4-chat
-        modelInfo["pre_prompt"] = "[gMASK]<sop>";
+        modelInfo["pre_prompt"] = "";
         modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.convert_tokens_to_ids("<|user|>")) + ">\n");
         modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.convert_tokens_to_ids("<|assistant|>")) + ">");
         modelInfo["history_sep"] = "";
-        modelInfo["eos_token_id"] = "151336"
+        modelInfo["tokenizer_class"] = tokenizer.name;
     if "rope_scaling" in modelInfo and isinstance(modelInfo["rope_scaling"], builtins.dict):
         rope_scaling = modelInfo.pop("rope_scaling")
         modelInfo["rope_scaling.type"] = rope_scaling["type"]
diff --git a/tools/fastllm_pytools/torch2flm.py b/tools/fastllm_pytools/torch2flm.py
index 6ef80111..523fe6b5 100644
--- a/tools/fastllm_pytools/torch2flm.py
+++ b/tools/fastllm_pytools/torch2flm.py
@@ -179,11 +179,11 @@ def tofile(exportPath,
         modelInfo["history_sep"] = "";
     if (modelInfo["model_type"] == "chatglm" and hasattr(tokenizer, "name") and tokenizer.name == "GLM4Tokenizer"):
         # glm-4-chat
-        modelInfo["pre_prompt"] = "[gMASK]<sop>";
+        modelInfo["pre_prompt"] = "";
         modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.convert_tokens_to_ids("<|user|>")) + ">\n");
         modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.convert_tokens_to_ids("<|assistant|>")) + ">");
         modelInfo["history_sep"] = "";
-        modelInfo["eos_token_id"] = "151336"
+        modelInfo["tokenizer_class"] = tokenizer.name;
     if "rope_scaling" in modelInfo and isinstance(modelInfo["rope_scaling"], builtins.dict):
         rope_scaling = modelInfo.pop("rope_scaling")
         modelInfo["rope_scaling.type"] = rope_scaling["type"]