diff --git a/include/models/bert.h b/include/models/bert.h
index 00b66aa..ef70ecd 100644
--- a/include/models/bert.h
+++ b/include/models/bert.h
@@ -33,7 +33,11 @@ namespace fastllm {
                          const GenerationConfig &generationConfig = GenerationConfig(),
                          const LastTokensManager &lastTokens = LastTokensManager(),
                          std::vector <float> *logits = nullptr);
-
+
+        std::vector <float> EmbeddingSentence(const std::vector <int> &tokens, bool normalize);
+
+        std::vector <std::vector <float> > EmbeddingSentenceBatch(const std::vector <std::vector <int> > &tokens, bool normalize);
+
         std::vector <float> EmbeddingSentence(const std::string &context, bool normalize);
 
         std::vector <std::vector <float> > EmbeddingSentenceBatch(const std::vector <std::string> &contexts, bool normalize);
@@ -56,7 +60,6 @@ namespace fastllm {
         int max_positions = 32768;
         int block_cnt = 12;
 
-        WeightMap weight; // weights
         std::map <std::string, int> deviceMap;
     };
 }
diff --git a/src/models/bert.cpp b/src/models/bert.cpp
index 9573cac..4518633 100644
--- a/src/models/bert.cpp
+++ b/src/models/bert.cpp
@@ -135,6 +135,40 @@ namespace fastllm {
         return ret;
     }
 
+    std::vector <float> BertModel::EmbeddingSentence(const std::vector <int> &tokens, bool normalize) {
+        std::vector <std::vector <int> > tokenss;
+        tokenss.push_back(tokens);
+        return EmbeddingSentenceBatch(tokenss, normalize)[0];
+    }
+
+    std::vector <std::vector <float> > BertModel::EmbeddingSentenceBatch(const std::vector <std::vector <int> > &tokens, bool normalize) {
+        int batch = tokens.size(), len = 0;
+        for (int i = 0; i < batch; i++) {
+            len = std::max(len, (int)tokens[i].size());
+        }
+
+        std::vector <float> ids = std::vector <float> (batch * len, 0.0f);
+        std::vector <float> seqLens = std::vector <float> (batch, 0.0f);
+        std::vector <float> token_type_ids = std::vector <float> (batch * len, 0.0f);
+        std::vector <float> attention_mask = std::vector <float> (batch * len, -1e10f);
+        std::vector <float> position_ids = std::vector <float> (batch * len, 0.0f);
+        for (int i = 0; i < batch; i++) {
+            seqLens[i] = tokens[i].size();
+            for (int j = 0; j < tokens[i].size(); j++) {
+                ids[i * len + j] = tokens[i][j];
+                attention_mask[i * len + j] = 0;
+                position_ids[i * len + j] = j;
+            }
+        }
+
+        fastllm::Data inputIds = fastllm::Data(fastllm::DataType::FLOAT32, {batch, len}, ids);
+        fastllm::Data attentionMask = fastllm::Data(fastllm::DataType::FLOAT32, {batch, len}, attention_mask);
+        fastllm::Data tokenTypeIds = fastllm::Data(fastllm::DataType::FLOAT32, {batch, len}, token_type_ids);
+        fastllm::Data positionIds = fastllm::Data(fastllm::DataType::FLOAT32, {batch, len}, position_ids);
+
+        return ForwardAll(inputIds, attentionMask, tokenTypeIds, positionIds, normalize);
+    }
+
     std::vector <float> BertModel::EmbeddingSentence(const std::string &context, bool normalize) {
         std::vector <std::string> contexts;
         contexts.push_back(context);
diff --git a/tools/fastllm_pytools/llm.py b/tools/fastllm_pytools/llm.py
index d1b8a68..bd76725 100644
--- a/tools/fastllm_pytools/llm.py
+++ b/tools/fastllm_pytools/llm.py
@@ -96,6 +96,12 @@
 fastllm_lib.embedding_sentence.argtypes = [ctypes.c_int, ctypes.c_char_p, ctypes.c_bool, ctypes.POINTER(ctypes.c_int)]
 fastllm_lib.embedding_sentence.restype = ctypes.POINTER(ctypes.c_float)
 
+fastllm_lib.embedding_tokens.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.POINTER(ctypes.c_int), ctypes.c_bool, ctypes.POINTER(ctypes.c_int)]
+fastllm_lib.embedding_tokens.restype = ctypes.POINTER(ctypes.c_float)
+
+fastllm_lib.reranker_compute_score.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.POINTER(ctypes.c_int), ctypes.POINTER(ctypes.c_int)]
+fastllm_lib.reranker_compute_score.restype = ctypes.POINTER(ctypes.c_float)
+
 def softmax(a):
     max_value = a[0]
     for i in a:
@@ -1086,12 +1092,30 @@ def get_max_input_len(self):
 
     def embedding_sentence(self, input: str, normalize = True):
         embedding_len = ctypes.c_int(0)
-        embedding_c_float = fastllm_lib.embedding_sentence(self.model, input.encode(), normalize, embedding_len)
+        if (self.hf_tokenizer != None):
+            input_ids = self.hf_tokenizer(input, padding = True, truncation = True)['input_ids']
+            embedding_c_float = fastllm_lib.embedding_tokens(self.model, len(input_ids), (ctypes.c_int * len(input_ids))(*input_ids), normalize, embedding_len)
+        else:
+            embedding_c_float = fastllm_lib.embedding_sentence(self.model, input.encode(), normalize, embedding_len)
         embedding = []
         for i in range(embedding_len.value):
             embedding.append(embedding_c_float[i])
             #print("{:.7f}".format(embedding[i]), end=" ")
         return embedding
+
+    def reranker_compute_score(self, pairs: List):
+        batch = len(pairs)
+        seq_lens = []
+        tokens = []
+        for i in range(batch):
+            input_ids = self.hf_tokenizer(pairs[i : i + 1], padding = True, truncation = True)['input_ids'][0]
+            seq_lens.append(len(input_ids))
+            tokens += input_ids
+        ret_c = fastllm_lib.reranker_compute_score(self.model, batch, (ctypes.c_int * len(seq_lens))(*seq_lens), (ctypes.c_int * len(tokens))(*tokens))
+        ret = []
+        for i in range(batch):
+            ret.append(ret_c[i])
+        return ret
 
 def GraphNode(name: str,
               type: str = "data",
diff --git a/tools/src/pytools.cpp b/tools/src/pytools.cpp
index 8bd133a..23e1878 100644
--- a/tools/src/pytools.cpp
+++ b/tools/src/pytools.cpp
@@ -435,4 +435,35 @@ extern "C" {
         *embeddingLen = result.size();
         return fvalue;
     }
+
+    DLL_EXPORT float* embedding_tokens(int modelId, int inputLen, int *input, bool normalize, int *embeddingLen) {
+        fastllm::BertModel *model = (fastllm::BertModel*)models.GetModel(modelId);
+        std::vector <int> tokens;
+        for (int i = 0; i < inputLen; i++) {
+            tokens.push_back(input[i]);
+        }
+        std::vector <float> result = model->EmbeddingSentence(tokens, normalize);
+        float *fvalue = new float[result.size()];
+        memcpy(fvalue, result.data(), result.size() * sizeof(float));
+        *embeddingLen = result.size();
+        return fvalue;
+    }
+
+    DLL_EXPORT float* reranker_compute_score(int modelId, int batch, int *seqLens, int *tokens) {
+        fastllm::XlmRobertaModel *model = (fastllm::XlmRobertaModel*)models.GetModel(modelId);
+        std::vector <std::vector <int> > inputIds;
+        inputIds.resize(batch);
+        int pos = 0;
+        for (int i = 0; i < batch; i++) {
+            for (int j = 0; j < seqLens[i]; j++) {
+                inputIds[i].push_back(tokens[pos++]);
+            }
+        }
+        auto ret = model->ComputeScore(inputIds);
+        float *fvalue = new float[batch];
+        for (int i = 0; i < batch; i++) {
+            fvalue[i] = ret[i];
+        }
+        return fvalue;
+    }
 };
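
A minimal usage sketch of the new token-based embedding path, written against the Python bindings added above. The model path is a placeholder, not part of this patch; it assumes a BERT-style embedding model already converted to fastllm format. When the loaded model has a Hugging Face tokenizer attached (self.hf_tokenizer is set), embedding_sentence now tokenizes in Python and calls the new embedding_tokens entry point; otherwise it falls back to the original embedding_sentence C entry point.

    # Sketch only: the model path below is hypothetical.
    from fastllm_pytools import llm

    model = llm.model("bge-small-zh-v1.5.flm")
    emb = model.embedding_sentence("hello fastllm", normalize = True)
    print(len(emb))   # embedding dimension
    print(emb[:4])    # first components of the (optionally L2-normalized) vector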
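A similar sketch for the reranker entry point. reranker_compute_score tokenizes entirely on the Python side, so it assumes the model was loaded in a way that attaches a Hugging Face tokenizer (self.hf_tokenizer must be set); the checkpoint name is again a placeholder. Each element of pairs is a [query, passage] pair, and the call returns one relevance score per pair.

    # Sketch only: the model path below is hypothetical; requires an attached HF tokenizer.
    from fastllm_pytools import llm

    reranker = llm.model("bge-reranker-base.flm")
    pairs = [["what is fastllm?", "fastllm is a high-performance LLM inference library"],
             ["what is fastllm?", "bananas are yellow"]]
    scores = reranker.reranker_compute_score(pairs)   # one score per [query, passage] pair
    print(scores)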