diff --git a/include/fastllm.h b/include/fastllm.h
index a756543b..a1c8b2af 100644
--- a/include/fastllm.h
+++ b/include/fastllm.h
@@ -183,7 +183,7 @@ namespace fastllm {
     };
 
     enum WeightType {
-        NONE = 0, LINEAR = 1, EMBEDDING = 2
+        NONE = 0, LINEAR = 1, EMBEDDING = 2, AUTO = 99999
     };
 
     struct FileMmap {
@@ -439,6 +439,8 @@ namespace fastllm {
 
         std::set <std::string> embeddingNames;
 
+        std::set <std::string> linearNames;
+
         void LoadFromFile(const std::string &fileName); // Load weights from a file
 
         void SaveLowBitModel(const std::string &fileName, int bit); // Save as a quantized model; bit = 0 means save without quantization
@@ -458,6 +460,8 @@ namespace fastllm {
         void AddQLinearWeight(const std::string &key, const std::vector <int> &dims,
                               int bit, float *scales, uint8_t *oriData); // Insert the weights of a QLinear layer; quantization rule: float value = scales * oriData
 
+        WeightType GetWeightType(const std::string &key); // Get the type of a weight (NONE if it cannot be determined)
+
         Data &operator [] (const std::string &key);
     };
diff --git a/include/model.h b/include/model.h
index e80c0578..a85c8d8c 100644
--- a/include/model.h
+++ b/include/model.h
@@ -14,6 +14,10 @@ namespace fastllm {
     std::unique_ptr <basellm> CreateLLMModelFromFile(const std::string &fileName);
 
     std::unique_ptr <basellm> CreateEmptyLLMModel(const std::string &modelType);
+
+    std::unique_ptr <basellm> CreateLLMModelFromHF(const std::string &modelPath,
+                                                   DataType linearDataType,
+                                                   int groupCnt = -1);
 }
 
 #endif //FASTLLM_MODEL_H
diff --git a/src/fastllm.cpp b/src/fastllm.cpp
index 6caa533a..a7f81f2c 100644
--- a/src/fastllm.cpp
+++ b/src/fastllm.cpp
@@ -1986,6 +1986,40 @@ namespace fastllm {
         this->peftDict[name][key] = value;
     }
 
+    WeightType WeightMap::GetWeightType(const std::string &key) {
+        if (this->embeddingNames.find(key) != this->embeddingNames.end()) {
+            return WeightType::EMBEDDING;
+        }
+        for (auto &linearName : this->linearNames) {
+            int n = key.size(), m = linearName.size();
+            std::vector <std::vector <int> > f = std::vector <std::vector <int> > (n + 1, std::vector <int> (m + 1, 0));
+            f[0][0] = 1;
+            for (int i = 0; i <= n; i++) {
+                for (int j = 0; j <= m; j++) {
+                    if (f[i][j]) {
+                        if (i + 1 <= n && key[i] == '*') {
+                            for (int l = j; l <= m; l++) {
+                                f[i + 1][l] = 1;
+                            }
+                        }
+                        if (j + 1 <= m && linearName[j] == '*') {
+                            for (int l = i; l <= n; l++) {
+                                f[l][j + 1] = 1;
+                            }
+                        }
+                        if (i + 1 <= n && j + 1 <= m && key[i] == linearName[j]) {
+                            f[i + 1][j + 1] = 1;
+                        }
+                    }
+                }
+            }
+            if (f[n][m]) {
+                return WeightType::LINEAR;
+            }
+        }
+        return WeightType::NONE;
+    }
+
     void WeightMap::AddQLinearWeight(const std::string &key, const std::vector <int> &dims,
                                      int bit, float *scales, uint8_t *oriData) {
         AssertInFastLLM(bit == 4 || bit == 8, "Error: only support 8 bit or 4 bit QLinear.\n");
@@ -2041,6 +2075,16 @@ namespace fastllm {
     void WeightMap::AddWeight(const std::string &key, const std::vector <int> &dims, fastllm::DataType dataType,
                               fastllm::WeightType weightType, fastllm::DataType oriDataType, uint8_t *oriData,
                               int groupCnt) {
+        if (weightType == WeightType::AUTO) {
+            weightType = GetWeightType(key);
+            if (weightType == WeightType::EMBEDDING) {
+                dataType = oriDataType;
+            }
+            if (weightType == WeightType::NONE) {
+                dataType = oriDataType;
+            }
+        }
+
         this->weight[key] = Data(dataType, dims);
         this->weight[key].name = std::string(key);
         Data &data = this->weight[key];
diff --git a/src/model.cpp b/src/model.cpp
index 91ef5f03..655ea1de 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -1,8 +1,10 @@
 #include "utils.h"
+#include "json11.hpp"
 #include "model.h"
 #include "fastllm.h"
 
 #include <sstream>
+#include <fstream>
 
 #include "chatglm.h"
 #include "moss.h"
@@ -16,6 +18,40 @@
 #include "bert.h"
 
 namespace fastllm {
+    std::string ReadAllFile(const std::string &fileName) {
+        if (access(fileName.c_str(), R_OK) != 0) {
+            ErrorInFastLLM("Read error: can't find \"" + fileName + "\".");
+        }
+
+        std::ifstream t(fileName.c_str());
+        std::string ret((std::istreambuf_iterator <char> (t)), std::istreambuf_iterator <char> ());
+        t.close();
+        return ret;
+    }
+
+    void ConvertDataType(uint8_t *src, DataType srcDtype, uint8_t *dst, DataType dstDtype, uint64_t len) {
+        if (srcDtype == dstDtype) {
+            int unitSize = 4;
+            if (dstDtype == DataType::FLOAT32) {
+                unitSize = 4;
+            } else if (dstDtype == DataType::FLOAT16 || dstDtype == DataType::BFLOAT16) {
+                unitSize = 2;
+            } else {
+                ErrorInFastLLM("ConvertDataType Failed. (" + std::to_string(srcDtype) + " -> " + std::to_string(dstDtype) + ")");
+            }
+            memcpy(dst, src, len * unitSize);
+        } else if (srcDtype == DataType::BFLOAT16 && dstDtype == DataType::FLOAT32) {
+            uint16_t *u16dst = (uint16_t*)dst;
+            uint16_t *u16src = (uint16_t*)src;
+            for (int i = 0; i < len; i++) {
+                u16dst[i * 2] = 0;
+                u16dst[i * 2 + 1] = u16src[i];
+            }
+        } else {
+            ErrorInFastLLM("ConvertDataType Failed. (" + std::to_string(srcDtype) + " -> " + std::to_string(dstDtype) + ")");
+        }
+    }
+
     void basellm::LoadFromFile(const std::string &fileName) {
         this->weight.LoadFromFile(fileName);
         this->InitParams();
@@ -153,4 +189,191 @@ namespace fastllm {
         basellm *model = CreateModelWithType(modelType);
         return std::unique_ptr <basellm> (model);
     }
+
+    struct SafeTensorItem {
+        std::string tensorName;
+        std::string fileName;
+        std::string dtype;
+        std::vector <uint64_t> shape;
+        std::vector <int> intShape;
+        std::vector <uint64_t> data_offsets;
+
+        uint64_t len, bytes;
+        uint8_t *buffer = nullptr;
+
+        SafeTensorItem() {}
+
+        SafeTensorItem(const std::string &tensorName, const std::string &fileName, const json11::Json &config, uint64_t baseOffset) {
+            this->tensorName = tensorName;
+            this->fileName = fileName;
+
+            this->dtype = config["dtype"].string_value();
+            for (auto &it : config["data_offsets"].array_items()) {
+                this->data_offsets.push_back(baseOffset + it.ll_value());
+            }
+            for (auto &it : config["shape"].array_items()) {
+                this->shape.push_back(it.ll_value());
+                this->intShape.push_back(this->shape.back());
+            }
+
+            len = 1;
+            for (auto &it : shape) {
+                len *= it;
+            }
+            bytes = this->data_offsets[1] - this->data_offsets[0];
+        }
+
+        void CreateBuffer(DataType dstType) {
+            DataType srcType;
+            if (this->dtype == "BF16") {
+                srcType = DataType::BFLOAT16;
+            } else {
+                ErrorInFastLLM("SafeTensorItem.CreateBuffer: unsupported src dtype " + this->dtype + "\n");
+            }
+
+            int unitSize = 4;
+            if (dstType == DataType::FLOAT32) {
+                unitSize = 4;
+            } else if (dstType == DataType::FLOAT16 || dstType == DataType::BFLOAT16) {
+                unitSize = 2;
+            } else {
+                ErrorInFastLLM("SafeTensorItem.CreateBuffer: unsupported dst dtype " + std::to_string(dstType) + "\n");
+            }
+            ClearBuffer();
+            buffer = new uint8_t[len * unitSize];
+
+            FILE *fi = fopen(this->fileName.c_str(), "rb");
+            int ret;
+#if defined(_WIN32) or defined(_WIN64)
+            _fseeki64(fi, this->data_offsets[0], 0);
+#else
+            fseek(fi, this->data_offsets[0], 0);
+#endif
+            if (dstType == srcType) {
+                ret = fread(buffer, 1, this->bytes, fi);
+            } else {
+                uint8_t *ori = new uint8_t[this->bytes];
+                ret = fread(ori, 1, this->bytes, fi);
+                ConvertDataType(ori, srcType, buffer, dstType, len);
+                delete[] ori;
+            }
+            fclose(fi);
+        }
+
+        void ClearBuffer() {
+            delete[] buffer;
+            buffer = nullptr;
+        }
+    };
+
+    struct SafeTensors {
+        std::set <std::string> fileNames;
+        std::map <std::string, SafeTensorItem> itmeDict;
+
+        SafeTensors (const std::set <std::string> &fileNames) {
+            std::string error;
+            this->fileNames = fileNames;
+            for (auto &fileName : fileNames) {
+                FILE *f = fopen(fileName.c_str(), "rb");
+                uint64_t configBytes;
+                int ret = fread(&configBytes, 8, 1, f);
+                char *configString = new char[configBytes + 5];
+                ret = fread(configString, 1, configBytes, f);
+                configString[configBytes] = 0;
+                auto config = json11::Json::parse(configString, error);
+                for (auto it : config.object_items()) {
+                    if (it.first != "__metadata__") {
+                        itmeDict[it.first] = SafeTensorItem(it.first, fileName, it.second, 8 + configBytes);
+                    }
+                }
+
+                delete[] configString;
+            }
+        }
+
+        std::vector <std::string> GetSortedItemNames() {
+            std::vector <std::pair <std::pair <std::string, uint64_t>, std::string> > v;
+            for (auto &it : itmeDict) {
+                v.push_back(std::make_pair(std::make_pair(it.second.fileName, it.second.data_offsets[0]), it.first));
+            }
+            std::sort(v.begin(), v.end());
+            std::vector <std::string> ret;
+            for (int i = 0; i < v.size(); i++) {
+                ret.push_back(v[i].second);
+            }
+            return ret;
+        }
+    };
+
+    // Load from a Hugging Face folder; only safetensors-format models are supported
+    std::unique_ptr <basellm> CreateLLMModelFromHF(const std::string &modelPath,
+                                                   DataType linearDataType, int groupCnt) {
+        std::string path = modelPath;
+        if (path.back() != '/' && path.back() != '\\') {
+            path += "/";
+        }
+
+        // 1. Check for model.safetensors.index.json and read it if present
+        std::string stIndexFile = path + "model.safetensors.index.json";
+        std::string error;
+        auto stIndex = json11::Json::parse(ReadAllFile(stIndexFile), error)["weight_map"];
+        std::set <std::string> stFiles;
+        for (auto it : stIndex.object_items()) {
+            stFiles.insert(path + it.second.string_value());
+        }
+        SafeTensors safeTensors(stFiles);
+
+        // 2. Create the basic network configuration
+        std::string configFile = path + "config.json";
+        auto config = json11::Json::parse(ReadAllFile(configFile), error);
+        basellm *model = CreateModelWithType(config["model_type"].string_value());
+        for (auto &it : config.object_items()) {
+            model->weight.AddDict(it.first, it.second.dump().c_str());
+        }
+
+        // 3. Load the tokenizer
+        std::string tokenizerConfigFile = path + "tokenizer_config.json";
+        auto tokenizerConfig = json11::Json::parse(ReadAllFile(tokenizerConfigFile), error);
+        std::string tokenizerClass = tokenizerConfig["tokenizer_class"].string_value();
+        if (tokenizerClass == "PreTrainedTokenizerFast") {
+            // PreTrainedTokenizerFast
+            std::string tokenizerFile = path + "tokenizer.json";
+            auto tokenizer = json11::Json::parse(ReadAllFile(tokenizerFile), error);
+            auto tokenizerModel = tokenizer["model"];
+            auto vocab = tokenizerModel["vocab"];
+            for (auto &it : vocab.object_items()) {
+                model->weight.AddTokenizerWord(it.first, it.second.int_value(), 1.0f);
+            }
+            std::map <std::string, int> spTokens;
+            for (auto &it : tokenizer["added_tokens"].array_items()) {
+                spTokens[it["content"].string_value()] = it["id"].int_value();
+            }
+            model->weight.tokenizer.SetSpecialTokens(spTokens);
+
+            if (!tokenizer["decoder"].is_null() && !tokenizer["decoder"]["type"].is_null() &&
+                tokenizer["decoder"]["type"].string_value() == "ByteLevel") {
+                model->weight.tokenizer.byteAsChar = true;
+            }
+        } else {
+            ErrorInFastLLM("Unsupported tokenizer_class: " + tokenizerClass);
+        }
+
+        // 4. Load the weights
+        int cur = 0;
+        for (auto &weightName : safeTensors.GetSortedItemNames()) {
+            auto &tensor = safeTensors.itmeDict[weightName];
+            tensor.CreateBuffer(DataType::FLOAT32);
+            model->weight.AddWeight(weightName, tensor.intShape, linearDataType, WeightType::AUTO, DataType::FLOAT32, tensor.buffer, groupCnt);
+            tensor.ClearBuffer();
+
+            printf("Load (%d / %d) \r", (++cur), (int)safeTensors.itmeDict.size());
+            fflush(stdout);
+        }
+        printf("\n");
+        fflush(stdout);
+
+        model->InitParams();
+        model->WarmUp();
+        return std::unique_ptr <basellm> (model);
+    }
 }
diff --git a/src/models/llama.cpp b/src/models/llama.cpp
index a99e9222..e05b1a0e 100644
--- a/src/models/llama.cpp
+++ b/src/models/llama.cpp
@@ -46,16 +46,22 @@ namespace fastllm {
     LlamaModel::LlamaModel() {
         this->model_type = "llama";
 
-        // Use the alpaca prompt and instruction format by default
-        this->pre_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n";
-        this->user_role = "### Instruction:\n";
-        this->bot_role = "\n\n### Response:";
-        this->history_sep = "";
+        // Use the llama3 prompt and instruction format by default
+        this->pre_prompt = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are a helpful assistant.<|eot_id|>";
+        this->user_role = "<|start_header_id|>user<|end_header_id|>\n";
+        this->bot_role = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n";
+        this->history_sep = "<|eot_id|>\n";
 
         block_cnt = 32;
         rotary_dim = 128;
 
         weight.embeddingNames.insert("model.embed_tokens.weight");
+        weight.linearNames = {
+            "lm_head.weight", "model.layers.*.mlp.down_proj.weight", "model.layers.*.mlp.up_proj.weight",
+            "model.layers.*.mlp.gate_proj.weight", "model.layers.*.mlp.gateup_proj.weight",
+            "model.layers.*.self_attn.o_proj.weight", "model.layers.*.self_attn.q_proj.weight", "model.layers.*.self_attn.k_proj.weight",
+            "model.layers.*.self_attn.v_proj.weight", "model.layers.*.self_attn.mergeqkv.weight", "model.layers.*.self_attn.W_pack.weight"
+        };
     }
 
     void LlamaModel::InitParams() {
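
Note on the WeightType::AUTO path added in src/fastllm.cpp: WeightMap::GetWeightType matches the weight name against each entry of linearNames, where '*' stands for any substring, so a single rule such as "model.layers.*.self_attn.q_proj.weight" covers that projection in every layer; names that match no pattern fall back to NONE and keep their original dtype. The snippet below is a minimal standalone sketch of the same matching rule, written only for illustration (WildcardMatch is a hypothetical helper, not part of this patch):

    #include <cstdio>
    #include <string>
    #include <vector>

    // Same dynamic-programming wildcard rule as WeightMap::GetWeightType:
    // '*' (in either string) matches any substring, other characters must match exactly.
    static bool WildcardMatch(const std::string &key, const std::string &pattern) {
        int n = key.size(), m = pattern.size();
        std::vector <std::vector <int> > f(n + 1, std::vector <int> (m + 1, 0));
        f[0][0] = 1;
        for (int i = 0; i <= n; i++) {
            for (int j = 0; j <= m; j++) {
                if (!f[i][j]) {
                    continue;
                }
                if (i < n && key[i] == '*') {          // '*' in the key absorbs any part of the pattern
                    for (int l = j; l <= m; l++) f[i + 1][l] = 1;
                }
                if (j < m && pattern[j] == '*') {      // '*' in the pattern absorbs any part of the key
                    for (int l = i; l <= n; l++) f[l][j + 1] = 1;
                }
                if (i < n && j < m && key[i] == pattern[j]) {
                    f[i + 1][j + 1] = 1;
                }
            }
        }
        return f[n][m] != 0;
    }

    int main() {
        // Matches: '*' covers the layer index, so this weight would be treated as LINEAR.
        printf("%d\n", WildcardMatch("model.layers.0.self_attn.q_proj.weight",
                                     "model.layers.*.self_attn.q_proj.weight"));
        // No match: embeddings are handled via embeddingNames, not the linear patterns.
        printf("%d\n", WildcardMatch("model.embed_tokens.weight",
                                     "model.layers.*.mlp.down_proj.weight"));
        return 0;
    }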
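
Typical call site for the new loader, assuming a local directory that contains model.safetensors.index.json, the *.safetensors shards, config.json, tokenizer_config.json and tokenizer.json. This is a minimal sketch: the path is hypothetical and FLOAT16 is just one possible choice for linearDataType; because of the WeightType::AUTO handling in WeightMap::AddWeight, only Linear weights are converted to that type while embeddings and unclassified weights keep their original dtype.

    #include <cstdio>
    #include <string>
    #include "model.h"

    int main() {
        // Hypothetical path to a Hugging Face safetensors export.
        std::string path = "/models/Meta-Llama-3-8B-Instruct/";

        // Load the model; Linear weights are stored as FLOAT16, groupCnt keeps its default (-1).
        auto model = fastllm::CreateLLMModelFromHF(path, fastllm::DataType::FLOAT16);

        printf("loaded model_type = %s\n", model->model_type.c_str());
        return 0;
    }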