From 13d30f82daa8423e380c08b29e83ce209bd35bcc Mon Sep 17 00:00:00 2001 From: LongxingTan Date: Tue, 17 Sep 2024 19:49:34 +0800 Subject: [PATCH] docs: update the fine-tuning examples --- README.md | 117 +++++++++++++--- README_ja-JP.md | 26 ++-- README_zh-CN.md | 132 +++++++++++++++--- docs/source/embed.rst | 43 +++--- docs/source/index.rst | 14 +- docs/source/quick-start.rst | 24 ++-- docs/source/rerank.rst | 29 +++- examples/0_embedding/README.md | 12 +- examples/0_embedding/train_llm.py | 5 +- examples/0_embedding/train_pairwise.py | 2 +- examples/2_reranking/train_llm.py | 186 ++++++++++--------------- examples/README.md | 3 +- examples/README_zh_CN.md | 3 +- src/retrievals/tools/langchain.py | 2 +- 14 files changed, 381 insertions(+), 217 deletions(-) diff --git a/README.md b/README.md index 9b9e6be..2185833 100644 --- a/README.md +++ b/README.md @@ -36,9 +36,9 @@ ![structure](./docs/source/_static/structure.png) **Open-retrievals** unify text embedding, retrieval, reranking and RAG. It's easy, flexible and scalable. -- Embedding fine-tuned through point-wise, pairwise, listwise, contrastive learning, and LLM. -- Reranking fine-tuned with Cross Encoder, ColBERT, and LLM. -- Easily build enhanced modular RAG, integrated with Transformers, Langchain, and LlamaIndex. +- Embedding fine-tuned through point-wise, pairwise, listwise, contrastive learning and LLM. +- Reranking fine-tuned with Cross-Encoder, ColBERT and LLM. +- Easily build enhanced modular RAG, integrated with Transformers, Langchain and LlamaIndex. 
| Experiment | Model | Original | Finetuned | Demo | |-------------------------------|------------------------|----------|-----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------| @@ -48,7 +48,7 @@ | **rerank** colbert | bge-m3 | 0.657 | **0.695** | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1QVtqhQ080ZMltXoJyODMmvEQYI6oo5kO?usp=sharing) | | **rerank** LLM (LoRA) | bge-reranker-v2-gemma | 0.637 | **0.706** | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1fzq1iV7-f8hNKFnjMmpVhVxadqPb9IXk?usp=sharing) | -* The metrics is MAP in 10% eval [t2-reranking data](https://huggingface.co/datasets/C-MTEB/T2Reranking). +* The eval metrics is MAP in 10% [t2-reranking data](https://huggingface.co/datasets/C-MTEB/T2Reranking). * Read [more examples](./examples) @@ -76,7 +76,7 @@ python -m pip install -U git+https://github.com/LongxingTan/open-retrievals.git [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1-WBMisdWLeHUKlzJ2DrREXY_kSV8vjP3?usp=sharing) -
Embeddings from pretrained weights +
Embedding from pretrained weights ```python from retrievals import AutoModelForEmbedding @@ -89,7 +89,7 @@ sentences = [ ] model_name_or_path = 'intfloat/e5-base-v2' model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="mean") -embeddings = model.encode(sentences, normalize_embeddings=True, convert_to_tensor=True) +embeddings = model.encode(sentences, normalize_embeddings=True) scores = (embeddings[:2] @ embeddings[2:].T) * 100 print(scores.tolist()) ``` @@ -103,7 +103,7 @@ from retrievals import AutoModelForEmbedding, AutoModelForRetrieval sentences = ['A dog is chasing car.', 'A man is playing a guitar.'] model_name_or_path = "sentence-transformers/all-MiniLM-L6-v2" index_path = './database/faiss/faiss.index' -model = AutoModelForEmbedding.from_pretrained(model_name_or_path) +model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method='mean') model.build_index(sentences, index_path=index_path) query_embed = model.encode("He plays guitar.") @@ -216,7 +216,7 @@ epochs: int = 3 train_dataset = load_dataset('shibing624/nli_zh', 'STS-B')['train'] train_dataset = train_dataset.rename_columns({'sentence1': 'query', 'sentence2': 'positive'}) tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) -model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="cls") +model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="mean") model = model.set_train_type('pairwise') optimizer = AdamW(model.parameters(), lr=5e-5) @@ -252,14 +252,22 @@ import torch.nn as nn from datasets import load_dataset from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup, TrainingArguments from retrievals import AutoModelForEmbedding, RetrievalTrainer, PairCollator, TripletCollator -from retrievals.losses import ArcFaceAdaptiveMarginLoss, InfoNCE, SimCSE, TripletLoss +from retrievals.losses import InfoNCE, SimCSE, TripletLoss + +def add_instructions(example): 
+ example['query'] = query_instruction + example['query'] + example['positive'] = document_instruction + example['positive'] + return example model_name_or_path: str = "Qwen/Qwen2-1.5B-Instruct" batch_size: int = 8 epochs: int = 3 +query_instruction = "Retrieve relevant passages that answer the query\nQuery: " +document_instruction = "Document: " train_dataset = load_dataset('shibing624/nli_zh', 'STS-B')['train'] train_dataset = train_dataset.rename_columns({'sentence1': 'query', 'sentence2': 'positive'}) +train_dataset = train_dataset.map(add_instructions) tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="last", use_lora=True) model = model.set_train_type('pairwise', loss_fn=InfoNCE(nn.CrossEntropyLoss(label_smoothing=0.05))) @@ -272,6 +280,7 @@ training_arguments = TrainingArguments( num_train_epochs=epochs, per_device_train_batch_size=batch_size, remove_unused_columns=False, + logging_steps=100, ) trainer = RetrievalTrainer( model=model, @@ -291,25 +300,32 @@ trainer.train() from transformers import AutoTokenizer, TrainingArguments, get_cosine_schedule_with_warmup, AdamW from retrievals import RerankCollator, AutoModelForRanking, RerankTrainer, RerankTrainDataset -model_name_or_path: str = "microsoft/deberta-v3-base" +model_name_or_path: str = "BAAI/bge-reranker-base" max_length: int = 128 learning_rate: float = 3e-5 batch_size: int = 4 epochs: int = 3 +output_dir: str = "./checkpoints" -train_dataset = RerankTrainDataset('./t2rank.json', positive_key='pos', negative_key='neg') +train_dataset = RerankTrainDataset("C-MTEB/T2Reranking", positive_key="positive", negative_key="negative", dataset_split='dev') tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) model = AutoModelForRanking.from_pretrained(model_name_or_path) optimizer = AdamW(model.parameters(), lr=learning_rate) num_train_steps = int(len(train_dataset) / batch_size * 
epochs) -scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps) +scheduler = get_cosine_schedule_with_warmup( + optimizer, + num_warmup_steps=0.05 * num_train_steps, + num_training_steps=num_train_steps, +) training_args = TrainingArguments( learning_rate=learning_rate, per_device_train_batch_size=batch_size, num_train_epochs=epochs, - output_dir='./checkpoints', + output_dir=output_dir, remove_unused_columns=False, + logging_steps=100, + report_to="none", ) trainer = RerankTrainer( model=model, @@ -348,9 +364,7 @@ epochs: int = 3 colbert_dim: int = 1024 output_dir: str = './checkpoints' -train_dataset = RetrievalTrainDataset( - 'C-MTEB/T2Reranking', positive_key='positive', negative_key='negative', dataset_split='dev' -) +train_dataset = RetrievalTrainDataset('C-MTEB/T2Reranking', positive_key='positive', negative_key='negative', dataset_split='dev') tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) data_collator = ColBertCollator( tokenizer, @@ -367,9 +381,7 @@ model = ColBERT.from_pretrained( optimizer = AdamW(model.parameters(), lr=learning_rate) num_train_steps = int(len(train_dataset) / batch_size * epochs) -scheduler = get_cosine_schedule_with_warmup( - optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps -) +scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps) training_args = TrainingArguments( learning_rate=learning_rate, @@ -394,7 +406,74 @@ trainer.train()
Fine-tune LLM reranking ```python +from transformers import ( + AdamW, + AutoTokenizer, + TrainingArguments, + get_cosine_schedule_with_warmup, +) +from retrievals import ( + LLMRanker, + LLMRerankCollator, + RerankTrainer, + RetrievalTrainDataset, +) +from retrievals.losses import TokenLoss + +model_name_or_path: str = "Qwen/Qwen2-1.5B-Instruct" +max_length: int = 512 +learning_rate: float = 3e-5 +batch_size: int = 8 +epochs: int = 3 +task_prompt: str = ( + """Given a query A and a passage B, determine whether the passage contains an answer to the query """ + """by providing a prediction of either 'Yes' or 'No'.""" +) + +tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) +train_dataset = RetrievalTrainDataset( + data_name_or_path='C-MTEB/T2Reranking', + positive_key='positive', + negative_key='negative', + query_instruction='A: ', + document_instruction='B: ', + dataset_split='dev', +) +data_collator = LLMRerankCollator(tokenizer=tokenizer, max_length=max_length, prompt=task_prompt, add_target_token='Yes') +token_index = tokenizer('Yes', add_special_tokens=False)['input_ids'][-1] +model = LLMRanker.from_pretrained( + model_name_or_path, + causal_lm=True, + use_fp16=True, + loss_fn=TokenLoss(token_index=token_index), + use_lora=True, +) + +optimizer = AdamW(model.parameters(), lr=learning_rate) +num_train_steps = int(len(train_dataset) / batch_size * epochs) +scheduler = get_cosine_schedule_with_warmup( + optimizer, + num_warmup_steps=0.05 * num_train_steps, + num_training_steps=num_train_steps, +) + +training_args = TrainingArguments( + learning_rate=learning_rate, + per_device_train_batch_size=batch_size, + num_train_epochs=epochs, + output_dir="./checkpoints", + remove_unused_columns=False, +) +trainer = RerankTrainer( + model=model, + args=training_args, + train_dataset=train_dataset, + data_collator=data_collator, +) +trainer.optimizer = optimizer +trainer.scheduler = scheduler +trainer.train() ```
diff --git a/README_ja-JP.md b/README_ja-JP.md index 5a3e927..bcd8a57 100644 --- a/README_ja-JP.md +++ b/README_ja-JP.md @@ -79,7 +79,7 @@ sentences = [ ] model_name_or_path = 'intfloat/e5-base-v2' model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="mean") -embeddings = model.encode(sentences, normalize_embeddings=True, convert_to_tensor=True) +embeddings = model.encode(sentences, normalize_embeddings=True) scores = (embeddings[:2] @ embeddings[2:].T) * 100 print(scores.tolist()) ``` @@ -91,7 +91,7 @@ from retrievals import AutoModelForEmbedding, AutoModelForRetrieval sentences = ['A dog is chasing car.', 'A man is playing a guitar.'] model_name_or_path = "sentence-transformers/all-MiniLM-L6-v2" index_path = './database/faiss/faiss.index' -model = AutoModelForEmbedding.from_pretrained(model_name_or_path) +model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method='mean') model.build_index(sentences, index_path=index_path) query_embed = model.encode("He plays guitar.") @@ -199,8 +199,9 @@ epochs: int = 3 train_dataset = load_dataset('shibing624/nli_zh', 'STS-B')['train'] train_dataset = train_dataset.rename_columns({'sentence1': 'query', 'sentence2': 'document'}) tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) -model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="cls") -# model = model.set_train_type('pointwise') # 'pointwise', 'pairwise', 'listwise' +model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="mean") +model = model.set_train_type('pairwise') + optimizer = AdamW(model.parameters(), lr=5e-5) num_train_steps = int(len(train_dataset) / batch_size * epochs) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps) @@ -240,25 +241,34 @@ model = AutoModelForEmbedding.from_pretrained( from transformers import AutoTokenizer, TrainingArguments, 
get_cosine_schedule_with_warmup, AdamW from retrievals import RerankCollator, AutoModelForRanking, RerankTrainer, RerankTrainDataset -model_name_or_path: str = "microsoft/deberta-v3-base" +model_name_or_path: str = "BAAI/bge-reranker-base" max_length: int = 128 learning_rate: float = 3e-5 batch_size: int = 4 epochs: int = 3 +output_dir: str = "./checkpoints" -train_dataset = RerankTrainDataset('./t2rank.json', positive_key='pos', negative_key='neg') +train_dataset = RerankTrainDataset( + "C-MTEB/T2Reranking", positive_key="positive", negative_key="negative", dataset_split='dev' +) tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) model = AutoModelForRanking.from_pretrained(model_name_or_path) optimizer = AdamW(model.parameters(), lr=learning_rate) num_train_steps = int(len(train_dataset) / batch_size * epochs) -scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps) +scheduler = get_cosine_schedule_with_warmup( + optimizer, + num_warmup_steps=0.05 * num_train_steps, + num_training_steps=num_train_steps, +) training_args = TrainingArguments( learning_rate=learning_rate, per_device_train_batch_size=batch_size, num_train_epochs=epochs, - output_dir='./checkpoints', + output_dir=output_dir, remove_unused_columns=False, + logging_steps=100, + report_to="none", ) trainer = RerankTrainer( model=model, diff --git a/README_zh-CN.md b/README_zh-CN.md index 58377b9..e3ecbe4 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -36,7 +36,7 @@ **Open-Retrievals** 支持统一调用或微调文本向量、检索、重排等模型,使信息检索、RAG应用更加便捷 - 支持全套向量微调,对比学习、大模型、point-wise、pairwise、listwise -- 支持全套重排微调,cross encoder、ColBERT、LLM +- 支持全套重排微调,cross-encoder、ColBERT、LLM - 支持定制化、模块化RAG,支持在Transformers、Langchain、LlamaIndex中便捷使用微调后的模型 | 实验 | 模型 | 原分数 | 微调分数 | Demo代码 | @@ -48,8 +48,8 @@ | 大模型重排 | bge-reranker-v2-gemma | 0.637 | **0.706** | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1fzq1iV7-f8hNKFnjMmpVhVxadqPb9IXk?usp=sharing) | -* 指标为 [t2-reranking 10% 测试数据](https://huggingface.co/datasets/C-MTEB/T2Reranking) MAP -* 阅读[更多示例](./examples/README_zh_CN.md) +* 测试指标为10%[t2-reranking数据](https://huggingface.co/datasets/C-MTEB/T2Reranking)的MAP +* 阅读[更多实例](./examples/README_zh_CN.md) ## 安装 @@ -89,8 +89,8 @@ sentences = [ ] model_name_or_path = 'intfloat/multilingual-e5-base' -model = AutoModelForEmbedding.from_pretrained(model_name_or_path) -embeddings = model.encode(sentences) # 384维度的文本向量 +model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="mean") +embeddings = model.encode(sentences, normalize_embeddings=True) # 384维度的文本向量 scores = (embeddings[:2] @ embeddings[2:].T) * 100 print(scores.tolist()) ``` @@ -105,7 +105,7 @@ from retrievals import AutoModelForEmbedding, AutoModelForRetrieval index_path = './database/faiss/faiss.index' sentences = ['在中国是中国人', '在美国是美国人', '2000人民币大于3000美元'] model_name_or_path = "sentence-transformers/all-MiniLM-L6-v2" -model = AutoModelForEmbedding.from_pretrained(model_name_or_path) +model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method='mean') model.build_index(sentences, index_path=index_path) query_embed = model.encode("在加拿大是加拿大人") @@ -208,14 +208,14 @@ print(response)
微调向量模型 -[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1w2dRoRThG6DnUW46swqEUuWySKS1AXCp?usp=sharing) - ```python +import os import torch.nn as nn from datasets import load_dataset from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup, TrainingArguments from retrievals import AutoModelForEmbedding, RetrievalTrainer, PairCollator, TripletCollator from retrievals.losses import ArcFaceAdaptiveMarginLoss, InfoNCE, SimCSE, TripletLoss +os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" batch_size: int = 32 @@ -224,7 +224,7 @@ epochs: int = 3 train_dataset = load_dataset('shibing624/nli_zh', 'STS-B')['train'] train_dataset = train_dataset.rename_columns({'sentence1': 'query', 'sentence2': 'positive'}) tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) -model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="cls") +model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="mean") model = model.set_train_type('pairwise') optimizer = AdamW(model.parameters(), lr=5e-5) @@ -257,18 +257,28 @@ trainer.train()
微调LLM向量模型 ```python +import os import torch.nn as nn from datasets import load_dataset from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup, TrainingArguments from retrievals import AutoModelForEmbedding, RetrievalTrainer, PairCollator, TripletCollator -from retrievals.losses import ArcFaceAdaptiveMarginLoss, InfoNCE, SimCSE, TripletLoss +from retrievals.losses import InfoNCE, SimCSE, TripletLoss +os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' + +def add_instructions(example): + example['query'] = query_instruction + example['query'] + example['positive'] = document_instruction + example['positive'] + return example model_name_or_path: str = "Qwen/Qwen2-1.5B-Instruct" batch_size: int = 8 epochs: int = 3 +query_instruction = "Retrieve relevant passages that answer the query\nQuery: " +document_instruction = "Document: " train_dataset = load_dataset('shibing624/nli_zh', 'STS-B')['train'] train_dataset = train_dataset.rename_columns({'sentence1': 'query', 'sentence2': 'positive'}) +train_dataset = train_dataset.map(add_instructions) tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="last", use_lora=True) model = model.set_train_type('pairwise', loss_fn=InfoNCE(nn.CrossEntropyLoss(label_smoothing=0.05))) @@ -281,6 +291,7 @@ training_arguments = TrainingArguments( num_train_epochs=epochs, per_device_train_batch_size=batch_size, remove_unused_columns=False, + logging_steps=100, ) trainer = RetrievalTrainer( model=model, @@ -298,28 +309,39 @@ trainer.train()
微调Cross-encoder重排模型 ```python +import os from transformers import AutoTokenizer, TrainingArguments, get_cosine_schedule_with_warmup, AdamW from retrievals import RerankCollator, AutoModelForRanking, RerankTrainer, RerankTrainDataset +os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' -model_name_or_path: str = "microsoft/deberta-v3-base" +model_name_or_path: str = "BAAI/bge-reranker-base" max_length: int = 128 learning_rate: float = 3e-5 batch_size: int = 4 epochs: int = 3 +output_dir: str = "./checkpoints" -train_dataset = RerankTrainDataset('./t2rank.json', positive_key='pos', negative_key='neg') +train_dataset = RerankTrainDataset( + "C-MTEB/T2Reranking", positive_key="positive", negative_key="negative", dataset_split='dev' +) tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) model = AutoModelForRanking.from_pretrained(model_name_or_path) optimizer = AdamW(model.parameters(), lr=learning_rate) num_train_steps = int(len(train_dataset) / batch_size * epochs) -scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps) +scheduler = get_cosine_schedule_with_warmup( + optimizer, + num_warmup_steps=0.05 * num_train_steps, + num_training_steps=num_train_steps, +) training_args = TrainingArguments( learning_rate=learning_rate, per_device_train_batch_size=batch_size, num_train_epochs=epochs, - output_dir='./checkpoints', + output_dir=output_dir, remove_unused_columns=False, + logging_steps=100, + report_to="none", ) trainer = RerankTrainer( model=model, @@ -350,6 +372,7 @@ from retrievals import ColBERT, ColBertCollator, RerankTrainer, RetrievalTrainDa from retrievals.losses import ColbertLoss transformers.logging.set_verbosity_error() +os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' os.environ["WANDB_DISABLED"] = "true" model_name_or_path: str = "BAAI/bge-m3" @@ -359,9 +382,7 @@ epochs: int = 3 colbert_dim: int = 1024 output_dir: str = './checkpoints' -train_dataset 
= RetrievalTrainDataset( - 'C-MTEB/T2Reranking', positive_key='positive', negative_key='negative', dataset_split='dev' -) +train_dataset = RetrievalTrainDataset('C-MTEB/T2Reranking', positive_key='positive', negative_key='negative', dataset_split='dev') tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) data_collator = ColBertCollator( tokenizer, @@ -378,9 +399,7 @@ model = ColBERT.from_pretrained( optimizer = AdamW(model.parameters(), lr=learning_rate) num_train_steps = int(len(train_dataset) / batch_size * epochs) -scheduler = get_cosine_schedule_with_warmup( - optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps -) +scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps) training_args = TrainingArguments( learning_rate=learning_rate, @@ -403,10 +422,81 @@ trainer.train()
-
微调LLM重排模型 +
微调大模型重排模型 ```python +import os +from transformers import ( + AdamW, + AutoTokenizer, + TrainingArguments, + get_cosine_schedule_with_warmup, +) + +from retrievals import ( + LLMRanker, + LLMRerankCollator, + RerankTrainer, + RetrievalTrainDataset, +) +from retrievals.losses import TokenLoss +os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' +model_name_or_path: str = "Qwen/Qwen2-1.5B-Instruct" +max_length: int = 512 +learning_rate: float = 3e-5 +batch_size: int = 8 +epochs: int = 3 +task_prompt: str = ( + """Given a query A and a passage B, determine whether the passage contains an answer to the query """ + """by providing a prediction of either 'Yes' or 'No'.""" +) + +tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) +train_dataset = RetrievalTrainDataset( + data_name_or_path='C-MTEB/T2Reranking', + positive_key='positive', + negative_key='negative', + query_instruction='A: ', + document_instruction='B: ', + dataset_split='dev', +) +data_collator = LLMRerankCollator( + tokenizer=tokenizer, max_length=max_length, prompt=task_prompt, add_target_token='Yes' +) +token_index = tokenizer('Yes', add_special_tokens=False)['input_ids'][-1] +model = LLMRanker.from_pretrained( + model_name_or_path, + causal_lm=True, + use_fp16=True, + loss_fn=TokenLoss(token_index=token_index), + use_lora=True, +) + +optimizer = AdamW(model.parameters(), lr=learning_rate) +num_train_steps = int(len(train_dataset) / batch_size * epochs) +scheduler = get_cosine_schedule_with_warmup( + optimizer, + num_warmup_steps=0.05 * num_train_steps, + num_training_steps=num_train_steps, +) + +training_args = TrainingArguments( + learning_rate=learning_rate, + per_device_train_batch_size=batch_size, + num_train_epochs=epochs, + output_dir="./checkpoints", + remove_unused_columns=False, +) +trainer = RerankTrainer( + model=model, + args=training_args, + train_dataset=train_dataset, + data_collator=data_collator, +) +trainer.optimizer = optimizer +trainer.scheduler = scheduler 
+trainer.train() ```
diff --git a/docs/source/embed.rst b/docs/source/embed.rst index 1865343..52b932d 100644 --- a/docs/source/embed.rst +++ b/docs/source/embed.rst @@ -6,9 +6,9 @@ Embedding 1. Use embedding from open-retrievals --------------------------------------- -we can use `AutoModelForEmbedding` to get the sentence embedding from pretrained transformer or large language model. +we can use `AutoModelForEmbedding` to get the text embedding from pretrained transformer or LLM. -The Transformer model could get the representation vector from a sentence. +The Transformer model could get a representation vector from a sentence. **Transformer encoder embedding model** @@ -20,7 +20,6 @@ The Transformer model could get the representation vector from a sentence. from retrievals import AutoModelForEmbedding model = AutoModelForEmbedding.from_pretrained('moka-ai/m3e-base', pooling_method='mean') - sentences = [ '* Moka 此文本嵌入模型由 MokaAI 训练并开源,训练脚本使用 uniem', '* Massive 此文本嵌入模型通过**千万级**的中文句对数据集进行训练', @@ -40,6 +39,8 @@ The Transformer model could get the representation vector from a sentence. model_name, pooling_method='last', use_fp16=True, + query_instruction='Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: ', + document_instruction='', ) .. 
code:: @@ -95,7 +96,7 @@ Pair wise train_dataset = load_dataset('shibing624/nli_zh', 'STS-B')['train'] train_dataset = train_dataset.rename_columns({'sentence1': 'query', 'sentence2': 'positive'}) tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) - model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="cls") + model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="mean") optimizer = AdamW(model.parameters(), lr=5e-5) num_train_steps=int(len(train_dataset) / batch_size * epochs) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps) @@ -118,22 +119,6 @@ Pair wise trainer.train() -Point wise -~~~~~~~~~~~~~~~~~~ - -If the positive and negative examples have some noise in label, the directly point-wise cross-entropy maybe not the best. The pair wise just compare relatively, or the hinge loss with margin could be better. - -arcface - -- layer wise learning rate -- batch size is important -- dynamic arcface_margin, margin is important -- arc_weight init - - -List wise -~~~~~~~~~~~~~~~~~~ - **Pairwise fine-tune embedding model** .. code-block:: shell @@ -200,6 +185,24 @@ List wise --save_total_limit 1 +Point wise +~~~~~~~~~~~~~~~~~~ + +If the positive and negative examples have some noise in label, the directly point-wise cross-entropy maybe not the best. The pair wise just compare relatively, or the hinge loss with margin could be better. + +arcface + +- layer wise learning rate +- batch size is important +- dynamic arcface_margin, margin is important +- arc_weight init + + +List wise +~~~~~~~~~~~~~~~~~~ + + + 3. 
Training skills to enhance the performance ---------------------------------------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 42503c0..5bb9097 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -4,10 +4,11 @@ Open-Retrievals Documentation GitHub -Retrievals is an easy, flexible, scalable framework supporting state-of-the-art embeddings, retrieval and reranking for information retrieval or RAG, based on PyTorch and Transformers. +Retrievals is an easy, flexible, scalable framework supporting state-of-the-art embeddings, retrieval and reranking for information retrieval or RAG. -* Embeddings fine-tuned by Contrastive learning -* Embeddings from LLM model +* Embedding fine-tuned through point-wise, pairwise, listwise, contrastive learning and LLM. +* Reranking fine-tuned with Cross-Encoder, ColBERT and LLM. +* Easily build enhanced modular RAG, integrated with Transformers, Langchain and LlamaIndex. Installation @@ -43,13 +44,13 @@ Run a simple example sentences = ["Hello NLP", "Open-retrievals is designed for retrieval, rerank and RAG"] model_name_or_path = "sentence-transformers/all-MiniLM-L6-v2" model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="mean") - sentence_embeddings = model.encode(sentences, normalize_embeddings=True, convert_to_tensor=True) + sentence_embeddings = model.encode(sentences, normalize_embeddings=True) print(sentence_embeddings) Open-retrievals support to fine-tune the embedding model, reranking model, llm easily for custom usage. 
-* `Pairwise embedding fine-tuning `_ -* `Pairwise LLM embedding fine-tuning `_ +* `Embedding pairwise fine-tuning `_ +* `LLM embedding pairwise fine-tuning `_ * `ColBERT fine-tuning `_ * `Cross-encoder reranking fine-tuning `_ * `LLM reranking fine-tuning `_ @@ -62,7 +63,6 @@ More datasets examples * `msmacro dataset `_ * `wikipedia nq dataset `_ * `rag example `_ -* `graph rag example `_ Contributing diff --git a/docs/source/quick-start.rst b/docs/source/quick-start.rst index c0011e8..1901410 100644 --- a/docs/source/quick-start.rst +++ b/docs/source/quick-start.rst @@ -13,7 +13,7 @@ We can use Open-retrievals to easily fine-tune models of information retrieval a 1. Embedding ----------------------------- -Use the pretrained embedding +Embedding from the pretrained model .. code-block:: python @@ -26,7 +26,7 @@ Use the pretrained embedding "passage: Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level." ] model_name_or_path = 'intfloat/e5-base-v2' - # sentence embedding mode + # sentence embedding model model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="mean") # encode the sentence to embedding vector embeddings = model.encode(sentences, normalize_embeddings=True, convert_to_tensor=True) @@ -58,7 +58,7 @@ To further improve the retrieval performance, we can fine tune the embedding mod train_dataset = load_dataset('shibing624/nli_zh', 'STS-B')['train'] train_dataset = train_dataset.rename_columns({'sentence1': 'query', 'sentence2': 'document'}) tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) - model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="cls") + model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="mean") optimizer = AdamW(model.parameters(), lr=5e-5) num_train_steps = int(len(train_dataset) / batch_size * epochs) scheduler = 
get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps) @@ -96,7 +96,7 @@ Save the document embedding offline using the vector database. sentences = ['A dog is chasing car.', 'A man is playing a guitar.'] model_name_or_path = "sentence-transformers/all-MiniLM-L6-v2" index_path = './database/faiss/faiss.index' - model = AutoModelForEmbedding.from_pretrained(model_name_or_path) + model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method='mean') model.build_index(sentences, index_path=index_path) query_embed = model.encode("He plays guitar.") @@ -118,7 +118,6 @@ If we have multiple retrieval source or a better sequence, we can add the rerank ["In 1974, I won the championship in Southeast Asia in my first kickboxing match", "In 1982, I defeated the heavy hitter Ryu Long."], ['A dog is chasing car.', 'A man is playing a guitar.'], ] - model_name_or_path: str = "BAAI/bge-reranker-base" rerank_model = AutoModelForRanking.from_pretrained(model_name_or_path) scores_list = rerank_model.compute_score(sentences) @@ -139,25 +138,32 @@ Similarly, fine tune the reranking model to get a better performance for the spe from transformers import AutoTokenizer, TrainingArguments, get_cosine_schedule_with_warmup, AdamW from retrievals import RerankCollator, AutoModelForRanking, RerankTrainer, RerankTrainDataset - model_name_or_path: str = "microsoft/deberta-v3-base" + model_name_or_path: str = "BAAI/bge-reranker-base" max_length: int = 128 learning_rate: float = 3e-5 batch_size: int = 4 epochs: int = 3 + output_dir: str = "./checkpoints" - train_dataset = RerankTrainDataset('./t2rank.json', positive_key='pos', negative_key='neg') + train_dataset = RerankTrainDataset("C-MTEB/T2Reranking", positive_key="positive", negative_key="negative", dataset_split='dev') tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) model = AutoModelForRanking.from_pretrained(model_name_or_path) 
optimizer = AdamW(model.parameters(), lr=learning_rate) num_train_steps = int(len(train_dataset) / batch_size * epochs) - scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps) + scheduler = get_cosine_schedule_with_warmup( + optimizer, + num_warmup_steps=0.05 * num_train_steps, + num_training_steps=num_train_steps, + ) training_args = TrainingArguments( learning_rate=learning_rate, per_device_train_batch_size=batch_size, num_train_epochs=epochs, - output_dir='./checkpoints', + output_dir=output_dir, remove_unused_columns=False, + logging_steps=100, + report_to="none", ) trainer = RerankTrainer( model=model, diff --git a/docs/source/rerank.rst b/docs/source/rerank.rst index f55bd71..54ee0e8 100644 --- a/docs/source/rerank.rst +++ b/docs/source/rerank.rst @@ -60,6 +60,16 @@ Rerank from retrievals import LLMRanker + model_name = 'BAAI/bge-reranker-v2-gemma' + model = LLMRanker.from_pretrained( + model_name, + causal_lm=True, + use_fp16=True, + ) + + scores = model.compute_score([['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]) + print('Ranking score: ', scores) + 2. 
Fine-tune cross-encoder reranking model ----------------------------------------------- @@ -74,31 +84,38 @@ Rerank from transformers import AutoTokenizer, TrainingArguments, get_cosine_schedule_with_warmup, AdamW from retrievals import RerankCollator, AutoModelForRanking, RerankTrainer, RerankTrainDataset - model_name_or_path: str = "microsoft/deberta-v3-base" + model_name_or_path: str = "BAAI/bge-reranker-base" max_length: int = 128 learning_rate: float = 3e-5 batch_size: int = 4 epochs: int = 3 + output_dir: str = "./checkpoints" - train_dataset = RerankTrainDataset('./t2rank.json', positive_key='pos', negative_key='neg') + train_dataset = RerankTrainDataset("C-MTEB/T2Reranking", positive_key="positive", negative_key="negative", dataset_split='dev') tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) - model = AutoModelForRanking.from_pretrained(model_name_or_path, pooling_method="mean") + model = AutoModelForRanking.from_pretrained(model_name_or_path) optimizer = AdamW(model.parameters(), lr=learning_rate) num_train_steps = int(len(train_dataset) / batch_size * epochs) - scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps) + scheduler = get_cosine_schedule_with_warmup( + optimizer, + num_warmup_steps=0.05 * num_train_steps, + num_training_steps=num_train_steps, + ) training_args = TrainingArguments( learning_rate=learning_rate, per_device_train_batch_size=batch_size, num_train_epochs=epochs, - output_dir = './checkpoints', + output_dir=output_dir, remove_unused_columns=False, + logging_steps=100, + report_to="none", ) trainer = RerankTrainer( model=model, args=training_args, train_dataset=train_dataset, - data_collator=RerankCollator(tokenizer, query_max_length=max_length, document_max_length=max_length), + data_collator=RerankCollator(tokenizer, max_length=max_length), ) trainer.optimizer = optimizer trainer.scheduler = scheduler diff --git 
a/examples/0_embedding/README.md b/examples/0_embedding/README.md index aad3109..bfa66a1 100644 --- a/examples/0_embedding/README.md +++ b/examples/0_embedding/README.md @@ -48,7 +48,7 @@ Refer to [the fine-tuning code](./train_llm.py), to train the model like Note - no need to set `causal_lm=True` like LLMRanker for AutoModelForEmbedding, but normally set the pooling_method to `last` -- set `query_instruction` +- set `query_instruction` and `document_instruction` in `RetrievalTrainDataset` during train or add it manually to text directly, set it in `AutoModelForEmbedding` during encode - "Given a query and a relevant document, retrieve the document that are pertinent to the query\nQuery: " - use the appropriate `pooling_method` - `last` @@ -60,8 +60,10 @@ from retrievals import AutoModelForEmbedding model_name = 'intfloat/e5-mistral-7b-instruct' model = AutoModelForEmbedding.from_pretrained( - model_name, - pooling_method='last', - use_fp16=True, - ) + model_name, + pooling_method='last', + use_fp16=True, + query_instruction='Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: ', + document_instruction='', +) ``` diff --git a/examples/0_embedding/train_llm.py b/examples/0_embedding/train_llm.py index fffa70f..e41139a 100644 --- a/examples/0_embedding/train_llm.py +++ b/examples/0_embedding/train_llm.py @@ -89,7 +89,7 @@ class TrainingArguments(transformers.TrainingArguments): num_train_epochs: int = 1 per_device_train_batch_size: int = 1 remove_unused_columns: bool = False - cache_dir: Optional[str] = field(default="/root/autodl-tmp/llm_output") + cache_dir: Optional[str] = None negatives_cross_device: bool = field(default=False, metadata={"help": "share negatives across devices"}) temperature: Optional[float] = field(default=0.02) fix_position_embedding: bool = field( @@ -99,7 +99,8 @@ class TrainingArguments(transformers.TrainingArguments): normalized: bool = field(default=True) use_inbatch_neg: bool = field(default=True, 
metadata={"help": "Freeze the parameters of position embeddings"}) gradient_accumulation_steps: int = field(default=1024) - fp16: bool = field(default=True) + bf16: bool = field(default=True) + logging_steps: int = field(default=100) @dataclass diff --git a/examples/0_embedding/train_pairwise.py b/examples/0_embedding/train_pairwise.py index ba3d345..5f398c7 100644 --- a/examples/0_embedding/train_pairwise.py +++ b/examples/0_embedding/train_pairwise.py @@ -22,7 +22,7 @@ def train(): train_dataset = load_dataset('shibing624/nli_zh', 'STS-B')['train'] train_dataset = train_dataset.rename_columns({'sentence1': 'query', 'sentence2': 'positive'}) tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) - model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="cls") + model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="mean") model = model.set_train_type('pairwise') optimizer = AdamW(model.parameters(), lr=5e-5) diff --git a/examples/2_reranking/train_llm.py b/examples/2_reranking/train_llm.py index 8e67104..6e4c198 100644 --- a/examples/2_reranking/train_llm.py +++ b/examples/2_reranking/train_llm.py @@ -1,146 +1,100 @@ """LLM Reranker fine-tuning""" -from dataclasses import dataclass, field -from typing import Optional - from transformers import ( AdamW, AutoTokenizer, - HfArgumentParser, TrainingArguments, get_cosine_schedule_with_warmup, - get_linear_schedule_with_warmup, ) from retrievals import ( - AutoModelForRanking, + LLMRanker, LLMRerankCollator, RerankTrainer, RetrievalTrainDataset, ) from retrievals.losses import TokenLoss +model_name_or_path: str = "Qwen/Qwen2-1.5B-Instruct" +max_length: int = 256 +learning_rate: float = 1e-5 +batch_size: int = 8 +epochs: int = 3 +task_prompt: str = ( + """Given a query A and a passage B, determine whether the passage contains an answer to the query""" + """by providing a prediction of either 'Yes' or 'No'.""" +) -@dataclass -class ModelArguments: 
- """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. - """ - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} +def train(): + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) + train_dataset = RetrievalTrainDataset( + data_name_or_path='C-MTEB/T2Reranking', + positive_key='positive', + negative_key='negative', + query_instruction='A: ', + document_instruction='B: ', + dataset_split='dev', ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + data_collator = LLMRerankCollator( + tokenizer=tokenizer, max_length=max_length, prompt=task_prompt, add_target_token='Yes' ) - cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + token_index = tokenizer('Yes', add_special_tokens=False)['input_ids'][-1] + model = LLMRanker.from_pretrained( + model_name_or_path, + causal_lm=True, + loss_fn=TokenLoss(token_index=token_index), + use_lora=True, ) - causal_lm: bool = field(default=False, metadata={'help': "Whether the model is a causal lm or not"}) - -@dataclass -class DataArguments: - data_name_or_path: str = field(default=None, metadata={"help": "Path to corpus"}) - train_group_size: int = field(default=8) - unfold_each_positive: bool = field(default=False) - max_length: int = field( - default=512, - metadata={ - "help": "The maximum total input sequence length after tokenization for input text. Sequences longer " - "than this will be truncated, sequences shorter will be padded." 
- }, + optimizer = AdamW(model.parameters(), lr=learning_rate) + num_train_steps = int(len(train_dataset) / batch_size * epochs) + scheduler = get_cosine_schedule_with_warmup( + optimizer, + num_warmup_steps=0.05 * num_train_steps, + num_training_steps=num_train_steps, ) - query_key: str = field(default=None) - positive_key: str = field(default=None) - negative_key: str = field(default=None) - query_instruction: str = field(default=None, metadata={"help": "instruction for query"}) - document_instruction: str = field(default=None, metadata={"help": "instruction for document"}) - task_prompt: str = field( - default=( - "Given a query A and a passage B, determine whether the passage contains an answer " - "to the query by providing a prediction of either 'Yes' or 'No'." - ) + training_args = TrainingArguments( + learning_rate=learning_rate, + per_device_train_batch_size=batch_size, + num_train_epochs=epochs, + bf16=True, + output_dir="./checkpoints", + remove_unused_columns=False, + logging_steps=100, ) + trainer = RerankTrainer( + model=model, + args=training_args, + train_dataset=train_dataset, + data_collator=data_collator, + ) + trainer.optimizer = optimizer + trainer.scheduler = scheduler + trainer.train() -@dataclass -class RerankerTrainingArguments(TrainingArguments): - model_type: str = field(default='cross-encoder', metadata={'help': "train type of cross-encoder, colbert"}) - negatives_cross_device: bool = field(default=False, metadata={"help": "share negatives across devices"}) - use_inbatch_negative: bool = field(default=False) - temperature: Optional[float] = field(default=0.02) - remove_unused_columns: bool = field(default=False) - num_train_epochs: int = field(default=3) - use_lora: bool = field(default=False) - use_bnb_config: bool = field(default=False) - do_rerank: bool = field(default=False, metadata={"help": "run the reranking loop"}) - - -def get_optimizer(model, learning_rate, weight_decay=0.0): - optimizer_parameters = [ - { - "params": [p for n, 
p in model.model.named_parameters()], - "lr": learning_rate, - "weight_decay": weight_decay, - }, - { - "params": [p for n, p in model.named_parameters() if "model" not in n], - "lr": learning_rate * 20, - "weight_decay": 0.0, - }, - ] - return AdamW(optimizer_parameters) - - -parser = HfArgumentParser((ModelArguments, DataArguments, RerankerTrainingArguments)) -model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - -tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=False, -) - -train_dataset = RetrievalTrainDataset( - args=data_args, - tokenizer=tokenizer, - unfold_each_positive=data_args.unfold_each_positive, - train_group_size=data_args.train_group_size, - positive_key=data_args.positive_key, - negative_key=data_args.negative_key, -) -data_collator = LLMRerankCollator(tokenizer=tokenizer, max_length=data_args.max_length, prompt=data_args.task_prompt) -token_index = tokenizer('Yes', add_special_tokens=False)['input_ids'][-1] -model = AutoModelForRanking.from_pretrained( - model_args.model_name_or_path, - num_labels=1, - loss_fn=TokenLoss(token_index=token_index, train_group_size=data_args.train_group_size), - causal_lm=True, - use_lora=training_args.use_lora, - quantization_config=None, -) -optimizer = get_optimizer(model, learning_rate=training_args.learning_rate) +def predict(): + model_name = 'BAAI/bge-reranker-v2-gemma' -num_train_steps = int(len(train_dataset) / training_args.per_device_train_batch_size * training_args.num_train_epochs) -scheduler = get_linear_schedule_with_warmup( - optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps -) + model = LLMRanker.from_pretrained( + model_name, + causal_lm=True, + use_fp16=True, + ) + scores = model.compute_score( + [ + ['what is panda?', 'hi'], + [ + 'what is panda?', + 'The giant panda, sometimes called a panda bear or simply 
panda, is a bear species endemic to China.', ], ] ) print(scores) -trainer = RerankTrainer( - model=model, - args=training_args, - train_dataset=train_dataset, - data_collator=data_collator, - tokenizer=tokenizer, -) -trainer.optimizer = optimizer -trainer.scheduler = scheduler -trainer.train() -model.save_pretrained(training_args.output_dir) +if __name__ == '__main__': + train() + predict() diff --git a/examples/README.md b/examples/README.md index b625350..e5b29c6 100644 --- a/examples/README.md +++ b/examples/README.md @@ -25,10 +25,11 @@ 1. The grad_norm during training is always zero? - consider to change fp16 or bf16 +- while training, set `bf16` or `fp16` in `TrainingArguments`; while inference, set `use_fp16=True` in `AutoModelForEmbedding` or `LLMRanker` 2. The fine-tuned embedding performance during inference is worse than original? - check whether the pooling_method is correct -- check whether the prompt is exactly same as training for LLM model +- check whether the prompt or instruction is exactly the same as in training for the LLM model 3. How can we fine-tune the `BAAI/bge-m3` ColBERT model? - open-retrievals support to fine-tune the `BAAI/bge-m3 colbert` directly, just don't set `use_fp16=True` while fine-tuning, and set the learning_rate smaller diff --git a/examples/README_zh_CN.md b/examples/README_zh_CN.md index 058d697..cfd2daf 100644 --- a/examples/README_zh_CN.md +++ b/examples/README_zh_CN.md @@ -60,10 +60,11 @@ export HF_ENDPOINT=https://hf-mirror.com 1. 训练过程中的 grad_norm 始终为零? - 考虑更改 fp16 或 bf16 +- 训练时,在`TrainingArguments`中设置`bf16`或`fp16`;推理时,在`AutoModelForEmbedding`或`LLMRanker`中设置`use_fp16=True` 2. 推理过程中微调后的嵌入性能比原始模型差? - 检查 pooling_method 是否正确 -- 检查 LLM 模型的提示词是否与训练时一致 +- 检查LLM大模型使用的提示词或instruction是否与训练时一致 3. 如何微调 `BAAI/bge-m3` ColBERT 模型?
- open-retrievals 支持直接微调 `BAAI/bge-m3 colbert`,只需在微调时不设置 `use_fp16=True`,并将学习率设置得更小 diff --git a/src/retrievals/tools/langchain.py b/src/retrievals/tools/langchain.py index 798d8fa..9204b32 100644 --- a/src/retrievals/tools/langchain.py +++ b/src/retrievals/tools/langchain.py @@ -45,7 +45,7 @@ class LangchainEmbedding(AutoModelForEmbedding, Embeddings): model_kwargs: Dict[str, Any] = dict() encode_kwargs: Dict[str, Any] = dict() - def __init__(self, model_name=None, **model_kwargs): + def __init__(self, model_name, **model_kwargs): Embeddings.__init__(self) self.model_kwargs = model_kwargs