
Commit

Merge branch 'main' into main
qgallouedec authored Oct 7, 2024
2 parents ace6591 + 51ca76b commit 8186e93
Showing 71 changed files with 700 additions and 445 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -133,7 +133,7 @@ training_args = RewardConfig(output_dir="Qwen2.5-0.5B-Reward", per_device_train_
trainer = RewardTrainer(
args=training_args,
model=model,
tokenizer=tokenizer,
processing_class=tokenizer,
train_dataset=dataset,
)
trainer.train()
@@ -166,7 +166,7 @@ dataset = dataset.map(lambda x: tokenizer(x["prompt"]), remove_columns="prompt")
training_args = RLOOConfig(output_dir="Qwen2.5-0.5B-RL")
trainer = RLOOTrainer(
config=training_args,
tokenizer=tokenizer,
processing_class=tokenizer,
policy=policy,
ref_policy=ref_policy,
reward_model=reward_model,
@@ -189,7 +189,7 @@ model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
training_args = DPOConfig(output_dir="Qwen2.5-0.5B-DPO")
trainer = DPOTrainer(model=model, args=training_args, train_dataset=dataset, tokenizer=tokenizer)
trainer = DPOTrainer(model=model, args=training_args, train_dataset=dataset, processing_class=tokenizer)
trainer.train()
```

2 changes: 1 addition & 1 deletion docs/source/alignprop_trainer.mdx
@@ -1,6 +1,6 @@
# Aligning Text-to-Image Diffusion Models with Reward Backpropagation

[![](https://img.shields.io/badge/All_models-AlignProp-blue)](https://huggingface.co/models?other=alignprop)
[![](https://img.shields.io/badge/All_models-AlignProp-blue)](https://huggingface.co/models?other=alignprop,trl)

## The why

6 changes: 3 additions & 3 deletions docs/source/bco_trainer.mdx
@@ -1,6 +1,6 @@
# BCO Trainer

[![](https://img.shields.io/badge/All_models-BCO-blue)](https://huggingface.co/models?other=bco)
[![](https://img.shields.io/badge/All_models-BCO-blue)](https://huggingface.co/models?other=bco,trl)

TRL supports the Binary Classifier Optimization (BCO).
The [BCO](https://huggingface.co/papers/2404.04656) authors train a binary classifier whose logit serves as a reward so that the classifier maps {prompt, chosen completion} pairs to 1 and {prompt, rejected completion} pairs to 0.
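As a rough sketch of that idea (not the `BCOTrainer` internals — details such as the paper's reward-shift term are omitted, and the scalar reward logits are assumed to come from a sequence-classification head):

```python
import torch
import torch.nn.functional as F

def bco_style_classifier_loss(chosen_logits: torch.Tensor, rejected_logits: torch.Tensor) -> torch.Tensor:
    # Push the classifier toward 1 for {prompt, chosen} pairs and 0 for
    # {prompt, rejected} pairs; the logit itself then acts as a reward signal.
    chosen_loss = F.binary_cross_entropy_with_logits(chosen_logits, torch.ones_like(chosen_logits))
    rejected_loss = F.binary_cross_entropy_with_logits(rejected_logits, torch.zeros_like(rejected_logits))
    return chosen_loss + rejected_loss
```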
@@ -32,7 +32,7 @@ bco_trainer = BCOTrainer(
model_ref,
args=training_args,
train_dataset=train_dataset,
tokenizer=tokenizer,
processing_class=tokenizer,
)
```
After this one can then call:
@@ -75,7 +75,7 @@ bco_trainer = BCOTrainer(
model_ref,
args=training_args,
train_dataset=train_dataset,
tokenizer=tokenizer,
processing_class=tokenizer,
embedding_func=embedding_func,
embedding_tokenizer=self.embedding_tokenizer,
)
26 changes: 14 additions & 12 deletions docs/source/clis.mdx
@@ -96,24 +96,26 @@ python examples/datasets/anthropic_hh.py --push_to_hub --hf_entity your-hf-org

The chat CLI lets you quickly load the model and talk to it. Simply run the following:

```bash
trl chat --model_name_or_path Qwen/Qwen1.5-0.5B-Chat
```
<pre><code>$ trl chat --model_name_or_path Qwen/Qwen1.5-0.5B-Chat
<strong><span style="color: red;">&lt;quentin_gallouedec&gt;:</span></strong>
What is the best programming language?

> [!TIP]
> To use the chat CLI with the developer installation, you must run `make dev`
>
<strong><span style="color: blue;">&lt;Qwen/Qwen1.5-0.5B-Chat&gt;:</span></strong>
There isn't a "best" programming language, as everyone has different style preferences, needs, and preferences. However, some people commonly use
languages like Python, Java, C++, and JavaScript, which are popular among developers for a variety of reasons, including readability, flexibility,
and scalability. Ultimately, it depends on personal preference, needs, and goals.
</code></pre>

Note that the chat interface relies on the tokenizer's [chat template](https://huggingface.co/docs/transformers/chat_templating) to format the inputs for the model. Make sure your tokenizer has a chat template defined.
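For reference, a minimal sketch of how a chat template is applied with `transformers` (the rendered prompt string depends entirely on the tokenizer's template):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B-Chat")
messages = [{"role": "user", "content": "What is the best programming language?"}]

# Render the conversation with the tokenizer's chat template; this is the
# string the model is prompted with before generation.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```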

Besides talking to the model there are a few commands you can use:

- **clear**: clears the current conversation and start a new one
- **example {NAME}**: load example named `{NAME}` from the config and use it as the user input
- **set {SETTING_NAME}={SETTING_VALUE};**: change the system prompt or generation settings (multiple settings are separated by a ';').
- **reset**: same as clear but also resets the generation configs to defaults if they have been changed by **set**
- **save {SAVE_NAME} (optional)**: save the current chat and settings to file by default to `./chat_history/{MODEL_NAME}/chat_{DATETIME}.yaml` or `{SAVE_NAME}` if provided
- **exit**: closes the interface
- `clear`: clears the current conversation and starts a new one
- `example {NAME}`: load example named `{NAME}` from the config and use it as the user input
- `set {SETTING_NAME}={SETTING_VALUE};`: change the system prompt or generation settings (multiple settings are separated by a `;`).
- `reset`: same as clear but also resets the generation configs to defaults if they have been changed by `set`
- `save` or `save {SAVE_NAME}`: save the current chat and settings to file by default to `./chat_history/{MODEL_NAME}/chat_{DATETIME}.yaml` or `{SAVE_NAME}` if provided
- `exit`: closes the interface

The default examples are defined in `examples/scripts/config/default_chat_config.yaml` but you can pass your own with `--config CONFIG_FILE` where you can also specify the default generation parameters.

4 changes: 2 additions & 2 deletions docs/source/cpo_trainer.mdx
@@ -1,6 +1,6 @@
# CPO Trainer

[![](https://img.shields.io/badge/All_models-CPO-blue)](https://huggingface.co/models?other=cpo)
[![](https://img.shields.io/badge/All_models-CPO-blue)](https://huggingface.co/models?other=cpo,trl)

## Overview

@@ -32,7 +32,7 @@ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

training_args = CPOConfig(output_dir="Qwen2-0.5B-CPO", logging_steps=10)
trainer = CPOTrainer(model=model, args=training_args, tokenizer=tokenizer, train_dataset=train_dataset)
trainer = CPOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
trainer.train()
```

2 changes: 1 addition & 1 deletion docs/source/ddpo_trainer.mdx
@@ -1,6 +1,6 @@
# Denoising Diffusion Policy Optimization

[![](https://img.shields.io/badge/All_models-DDPO-blue)](https://huggingface.co/models?other=ddpo)
[![](https://img.shields.io/badge/All_models-DDPO-blue)](https://huggingface.co/models?other=ddpo,trl)

## The why

10 changes: 5 additions & 5 deletions docs/source/dpo_trainer.mdx
@@ -1,6 +1,6 @@
# DPO Trainer

[![](https://img.shields.io/badge/All_models-DPO-blue)](https://huggingface.co/models?other=dpo)
[![](https://img.shields.io/badge/All_models-DPO-blue)](https://huggingface.co/models?other=dpo,trl)

## Overview

@@ -47,7 +47,7 @@ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO", logging_steps=10)
trainer = DPOTrainer(model=model, args=training_args, tokenizer=tokenizer, train_dataset=train_dataset)
trainer = DPOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
trainer.train()
```

@@ -100,8 +100,8 @@ Additionally, unlike standard text-based models where a `tokenizer` is used, for
model,
args=training_args,
train_dataset=train_dataset,
- tokenizer=tokenizer,
+ tokenizer=processor,
- processing_class=tokenizer,
+ processing_class=processor,
)
```

@@ -194,7 +194,7 @@ First install `unsloth` according to the [official documentation](https://github

- training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO", logging_steps=10)
+ training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO", logging_steps=10, bf16=True)
trainer = DPOTrainer(model=model, args=training_args, tokenizer=tokenizer, train_dataset=train_dataset)
trainer = DPOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
trainer.train()

```
4 changes: 2 additions & 2 deletions docs/source/gkd_trainer.md
@@ -1,6 +1,6 @@
# Generalized Knowledge Distillation Trainer

[![](https://img.shields.io/badge/All_models-GKD-blue)](https://huggingface.co/models?other=gkd)
[![](https://img.shields.io/badge/All_models-GKD-blue)](https://huggingface.co/models?other=gkd,trl)

## Overview

@@ -74,7 +74,7 @@ trainer = GKDTrainer(
model=model,
teacher_model=teacher_model,
args=training_args,
tokenizer=tokenizer,
processing_class=tokenizer,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
)
2 changes: 1 addition & 1 deletion docs/source/iterative_sft_trainer.mdx
@@ -1,6 +1,6 @@
# Iterative Trainer

[![](https://img.shields.io/badge/All_models-Iterative_SFT-blue)](https://huggingface.co/models?other=iterative-sft)
[![](https://img.shields.io/badge/All_models-Iterative_SFT-blue)](https://huggingface.co/models?other=iterative-sft,trl)


Iterative fine-tuning is a training method that enables you to perform custom actions (generation and filtering, for example) between optimization steps. In TRL we provide an easy-to-use API to fine-tune your models in an iterative way in just a few lines of code.
4 changes: 2 additions & 2 deletions docs/source/kto_trainer.mdx
@@ -1,6 +1,6 @@
# KTO Trainer

[![](https://img.shields.io/badge/All_models-KTO-blue)](https://huggingface.co/models?other=kto)
[![](https://img.shields.io/badge/All_models-KTO-blue)](https://huggingface.co/models?other=kto,trl)

TRL supports the Kahneman-Tversky Optimization (KTO) Trainer for aligning language models with binary feedback data (e.g., upvote/downvote), as described in the [paper](https://huggingface.co/papers/2402.01306) by Kawin Ethayarajh, Winnie Xu, Niklas Muennighoff, Dan Jurafsky, and Douwe Kiela.
For a full example have a look at [`examples/scripts/kto.py`].
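For context, a minimal sketch of what binary-feedback data looks like, assuming TRL's unpaired-preference column names (`"prompt"`, `"completion"`, and a boolean `"label"`); adjust the columns if your dataset differs:

```python
from datasets import Dataset

# Each row carries a single completion plus a binary signal
# (True = desirable / upvote, False = undesirable / downvote).
train_dataset = Dataset.from_list(
    [
        {"prompt": "What is the capital of France?", "completion": "Paris.", "label": True},
        {"prompt": "What is the capital of France?", "completion": "Dunno, maybe London?", "label": False},
    ]
)
```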
@@ -89,7 +89,7 @@ kto_trainer = KTOTrainer(
ref_model,
args=training_args,
train_dataset=train_dataset,
tokenizer=tokenizer,
processing_class=tokenizer,
)
```
After this one can then call:
4 changes: 2 additions & 2 deletions docs/source/nash_md_trainer.md
@@ -1,6 +1,6 @@
# Nash-MD Trainer

[![](https://img.shields.io/badge/All_models-Nash--MD-blue)](https://huggingface.co/models?other=nash-md)
[![](https://img.shields.io/badge/All_models-Nash--MD-blue)](https://huggingface.co/models?other=nash-md,trl)

## Overview

@@ -41,7 +41,7 @@ trainer = NashMDTrainer(
model=model,
reward_model=reward_model,
args=training_args,
tokenizer=tokenizer,
processing_class=tokenizer,
train_dataset=train_dataset,
)
trainer.train()
4 changes: 2 additions & 2 deletions docs/source/online_dpo_trainer.md
@@ -1,6 +1,6 @@
# Online DPO Trainer

[![](https://img.shields.io/badge/All_models-Online_DPO-blue)](https://huggingface.co/models?other=online-dpo)
[![](https://img.shields.io/badge/All_models-Online_DPO-blue)](https://huggingface.co/models?other=online-dpo,trl)

## Overview

@@ -40,7 +40,7 @@ train_dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")

training_args = OnlineDPOConfig(output_dir="online-dpo-qwen2", logging_steps=10)
trainer = OnlineDPOTrainer(
model=model, reward_model=reward_model, args=training_args, tokenizer=tokenizer, train_dataset=train_dataset
model=model, reward_model=reward_model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset
)
trainer.train()
```
4 changes: 2 additions & 2 deletions docs/source/orpo_trainer.md
@@ -1,6 +1,6 @@
# ORPO Trainer

[![](https://img.shields.io/badge/All_models-ORPO-blue)](https://huggingface.co/models?other=orpo)
[![](https://img.shields.io/badge/All_models-ORPO-blue)](https://huggingface.co/models?other=orpo,trl)

[Odds Ratio Preference Optimization](https://huggingface.co/papers/2403.07691) (ORPO) by Jiwoo Hong, Noah Lee, and James Thorne studies the crucial role of SFT within the context of preference alignment. Using preference data, the method posits that a minor penalty for the disfavored generation, together with a strong adaptation signal toward the chosen response via a simple log-odds-ratio term appended to the NLL loss, is sufficient for preference-aligned SFT.
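A small sketch of that log-odds-ratio term, assuming length-averaged sequence log-probabilities for the chosen and rejected completions (illustrative only, not TRL's exact implementation):

```python
import torch
import torch.nn.functional as F

def odds_ratio_term(logp_chosen: torch.Tensor, logp_rejected: torch.Tensor) -> torch.Tensor:
    # The odds of a completion with probability p are p / (1 - p),
    # so its log-odds are log p - log(1 - p).
    log_odds_chosen = logp_chosen - torch.log1p(-torch.exp(logp_chosen))
    log_odds_rejected = logp_rejected - torch.log1p(-torch.exp(logp_rejected))
    # Penalize cases where the rejected completion has higher odds than the chosen one.
    return -F.logsigmoid(log_odds_chosen - log_odds_rejected)

# The full objective adds this term, scaled by a weight, to the NLL loss on the chosen response.
```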

@@ -66,7 +66,7 @@ orpo_trainer = ORPOTrainer(
model,
args=training_args,
train_dataset=train_dataset,
tokenizer=tokenizer,
processing_class=tokenizer,
)
```
After this one can then call:
2 changes: 1 addition & 1 deletion docs/source/ppo_trainer.mdx
@@ -1,6 +1,6 @@
# PPO Trainer

[![](https://img.shields.io/badge/All_models-PPO-blue)](https://huggingface.co/models?other=ppo)
[![](https://img.shields.io/badge/All_models-PPO-blue)](https://huggingface.co/models?other=ppo,trl)

TRL supports the [PPO](https://huggingface.co/papers/1707.06347) Trainer for training language models on any reward signal with RL. The reward signal can come from a handcrafted rule, a metric, or preference data using a Reward Model. For a full example, have a look at [`examples/notebooks/gpt2-sentiment.ipynb`](https://github.com/lvwerra/trl/blob/main/examples/notebooks/gpt2-sentiment.ipynb). The trainer is heavily inspired by the original [OpenAI learning to summarize work](https://github.com/openai/summarize-from-feedback).
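As a toy illustration of the "handcrafted rule" case, a reward can be any Python function scored per completion (the rules here are hypothetical and chosen for brevity; real setups typically use a trained reward model or a task metric):

```python
def rule_based_reward(completion: str) -> float:
    reward = 0.0
    if completion.strip().endswith("."):  # reward well-terminated answers
        reward += 0.5
    if len(completion.split()) <= 50:     # prefer reasonably short answers
        reward += 0.5
    return reward

print(rule_based_reward("Paris is the capital of France."))  # 1.0
```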

2 changes: 1 addition & 1 deletion docs/source/ppov2_trainer.md
@@ -1,6 +1,6 @@
# PPOv2 Trainer

[![](https://img.shields.io/badge/All_models-PPO-blue)](https://huggingface.co/models?other=ppo)
[![](https://img.shields.io/badge/All_models-PPO-blue)](https://huggingface.co/models?other=ppo,trl)

TRL supports training LLMs with [Proximal Policy Optimization (PPO)](https://huggingface.co/papers/1707.06347).

4 changes: 2 additions & 2 deletions docs/source/reward_trainer.mdx
@@ -1,6 +1,6 @@
# Reward Modeling

[![](https://img.shields.io/badge/All_models-Reward_Trainer-blue)](https://huggingface.co/models?other=reward-trainer)
[![](https://img.shields.io/badge/All_models-Reward_Trainer-blue)](https://huggingface.co/models?other=reward-trainer,trl)

TRL supports custom reward modeling, allowing anyone to train a reward model on their own dataset and model.

@@ -41,7 +41,7 @@ peft_config = LoraConfig(
trainer = RewardTrainer(
model=model,
args=training_args,
tokenizer=tokenizer,
processing_class=tokenizer,
train_dataset=dataset,
peft_config=peft_config,
)
2 changes: 1 addition & 1 deletion docs/source/rloo_trainer.md
@@ -1,6 +1,6 @@
# RLOO Trainer

[![](https://img.shields.io/badge/All_models-RLOO-blue)](https://huggingface.co/models?other=rloo)
[![](https://img.shields.io/badge/All_models-RLOO-blue)](https://huggingface.co/models?other=rloo,trl)

TRL supports training LLMs with REINFORCE Leave-One-Out (RLOO). The idea is that instead of using a value function, RLOO generates K completions for each prompt. For each completion, RLOO uses the mean scores from the other K-1 completions as a baseline to calculate the advantage. RLOO also models the entire completion as a single action, whereas PPO models each token as an action. Note that REINFORCE / A2C is a special case of PPO, when the number of PPO epochs is 1 and the number of mini-batches is 1, which is how we implement RLOO in TRL.
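A small sketch of the leave-one-out baseline described above (shapes and names are illustrative; this is not the trainer's internal code):

```python
import torch

def rloo_advantages(rewards: torch.Tensor) -> torch.Tensor:
    # rewards has shape (num_prompts, k): one scalar reward per completion,
    # with k completions sampled for each prompt. Each completion is baselined
    # against the mean reward of the other k - 1 completions for the same prompt.
    k = rewards.shape[1]
    baseline = (rewards.sum(dim=1, keepdim=True) - rewards) / (k - 1)
    return rewards - baseline

rewards = torch.tensor([[1.0, 0.0, 0.5, 0.5]])  # k = 4 completions for one prompt
print(rloo_advantages(rewards))                 # tensor([[ 0.6667, -0.6667,  0.0000,  0.0000]])
```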

4 changes: 2 additions & 2 deletions docs/source/sft_trainer.mdx
@@ -1,6 +1,6 @@
# Supervised Fine-tuning Trainer

[![](https://img.shields.io/badge/All_models-SFT-blue)](https://huggingface.co/models?other=sft)
[![](https://img.shields.io/badge/All_models-SFT-blue)](https://huggingface.co/models?other=sft,trl)

Supervised fine-tuning (or SFT for short) is a crucial step in RLHF. In TRL we provide an easy-to-use API to create your SFT models and train them on your dataset in just a few lines of code.

@@ -744,7 +744,7 @@ trainer = SFTTrainer(
args=training_args,
data_collator=collate_fn,
train_dataset=train_dataset,
tokenizer=processor.tokenizer,
processing_class=processor.tokenizer,
)
```

4 changes: 2 additions & 2 deletions docs/source/xpo_trainer.mdx
@@ -1,6 +1,6 @@
# XPO Trainer

[![](https://img.shields.io/badge/All_models-XPO-blue)](https://huggingface.co/models?other=xpo)
[![](https://img.shields.io/badge/All_models-XPO-blue)](https://huggingface.co/models?other=xpo,trl)

## Overview

@@ -41,7 +41,7 @@ trainer = XPOTrainer(
model=model,
reward_model=reward_model,
args=training_args,
tokenizer=tokenizer,
processing_class=tokenizer,
train_dataset=train_dataset,
)
trainer.train()
@@ -237,7 +237,7 @@ def return_prompt_and_responses(samples) -> Dict[str, str]:
beta=script_args.beta,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=tokenizer,
processing_class=tokenizer,
peft_config=peft_config,
max_prompt_length=script_args.max_prompt_length,
max_length=script_args.max_length,
@@ -187,7 +187,7 @@ def create_datasets(tokenizer, args, seed=None):
peft_config=peft_config,
max_seq_length=None,
formatting_func=prepare_sample_text,
tokenizer=tokenizer,
processing_class=tokenizer,
args=training_args,
)
trainer.train()
2 changes: 1 addition & 1 deletion examples/scripts/bco.py
@@ -152,7 +152,7 @@ def mean_pooling(model_output, attention_mask):
args=training_args,
train_dataset=dataset["train"],
eval_dataset=dataset["test"],
tokenizer=tokenizer,
processing_class=tokenizer,
peft_config=get_peft_config(model_args),
embedding_func=embedding_func,
embedding_tokenizer=embedding_tokenizer,
2 changes: 1 addition & 1 deletion examples/scripts/cpo.py
@@ -100,7 +100,7 @@ class ScriptArguments:
args=training_args,
train_dataset=dataset["train"],
eval_dataset=dataset["test"],
tokenizer=tokenizer,
processing_class=tokenizer,
peft_config=get_peft_config(model_config),
)

2 changes: 1 addition & 1 deletion examples/scripts/dpo.py
@@ -121,7 +121,7 @@
args=training_args,
train_dataset=dataset[script_args.dataset_train_split],
eval_dataset=dataset[script_args.dataset_test_split],
tokenizer=tokenizer,
processing_class=tokenizer,
peft_config=peft_config,
)

2 changes: 1 addition & 1 deletion examples/scripts/dpo_online.py
@@ -107,7 +107,7 @@
args=training_args,
train_dataset=dataset[script_args.dataset_train_split],
eval_dataset=dataset[script_args.dataset_test_split],
tokenizer=tokenizer,
processing_class=tokenizer,
peft_config=get_peft_config(model_config),
)
generation_config = GenerationConfig(
2 changes: 1 addition & 1 deletion examples/scripts/dpo_vlm.py
@@ -126,7 +126,7 @@ def process(row):
args=training_args,
train_dataset=dataset[script_args.dataset_train_split],
eval_dataset=dataset[script_args.dataset_test_split],
tokenizer=processor,
processing_class=processor,
peft_config=peft_config,
)

