🎲 Move random judges in testing utilities (#2365)
* Update judges and testing utilities

* Update judges in test files

* Update judges in test files
qgallouedec authored Nov 18, 2024
1 parent b5eabbe commit b80c1a6
Showing 10 changed files with 56 additions and 109 deletions.
46 changes: 19 additions & 27 deletions docs/source/judges.mdx
@@ -11,7 +11,7 @@ TRL provides judges to easily compare two completions.
Make sure to have installed the required dependencies by running:

```bash
-pip install trl[llm_judge]
+pip install trl[judges]
```

## Using the provided judges
@@ -52,46 +52,38 @@ judge.judge(
) # Outputs: [0, 1]
```

-## AllTrueJudge
+## Provided judges

-[[autodoc]] AllTrueJudge

-## BaseJudge

-[[autodoc]] BaseJudge

-## BaseBinaryJudge
+### PairRMJudge

-[[autodoc]] BaseBinaryJudge
+[[autodoc]] PairRMJudge

-## BaseRankJudge
+### HfPairwiseJudge

-[[autodoc]] BaseRankJudge

-## BasePairwiseJudge
+[[autodoc]] HfPairwiseJudge

-[[autodoc]] BasePairwiseJudge
+### OpenAIPairwiseJudge

-## RandomBinaryJudge
+[[autodoc]] OpenAIPairwiseJudge

-[[autodoc]] RandomBinaryJudge
+### AllTrueJudge

-## RandomRankJudge
+[[autodoc]] AllTrueJudge

-[[autodoc]] RandomRankJudge
+## Base classes

-## RandomPairwiseJudge
+### BaseJudge

-[[autodoc]] RandomPairwiseJudge
+[[autodoc]] BaseJudge

-## PairRMJudge
+### BaseBinaryJudge

-[[autodoc]] PairRMJudge
+[[autodoc]] BaseBinaryJudge

-## HfPairwiseJudge
+### BaseRankJudge

-[[autodoc]] HfPairwiseJudge
+[[autodoc]] BaseRankJudge

-## OpenAIPairwiseJudge
+### BasePairwiseJudge

-[[autodoc]] OpenAIPairwiseJudge
+[[autodoc]] BasePairwiseJudge
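
For reference, the usage pattern the reorganized page documents looks like this minimal sketch (assuming `trl[judges]` and its `llm-blender` dependency are installed and the PairRM model download succeeds; the prompts and printed output are purely illustrative):

```python
from trl import PairRMJudge

# PairRMJudge compares two completions per prompt and returns, for each prompt,
# the index of the completion it prefers.
judge = PairRMJudge()
results = judge.judge(
    prompts=["What is the capital of France?", "What is the smallest prime number?"],
    completions=[["Paris", "Lyon"], ["2", "1"]],
)
print(results)  # e.g. [0, 0]
```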
2 changes: 1 addition & 1 deletion setup.py
@@ -87,7 +87,7 @@
"diffusers": ["diffusers>=0.18.0"],
# liger-kernel depends on triton, which is only available on Linux https://github.com/triton-lang/triton#compatibility
"liger": ["liger-kernel>=0.4.0; sys_platform != 'win32'"],
"llm_judge": ["openai>=1.23.2", "llm-blender>=0.0.2"],
"judges": ["openai>=1.23.2", "llm-blender>=0.0.2"],
"peft": ["peft>=0.8.0"],
"quantization": ["bitsandbytes"],
"scikit": ["scikit-learn"],
34 changes: 3 additions & 31 deletions tests/test_judges.py
@@ -15,16 +15,9 @@
import time
import unittest

-from trl import (
-AllTrueJudge,
-HfPairwiseJudge,
-PairRMJudge,
-RandomBinaryJudge,
-RandomPairwiseJudge,
-RandomRankJudge,
-)
+from trl import AllTrueJudge, HfPairwiseJudge, PairRMJudge

-from .testing_utils import require_llm_blender
+from .testing_utils import RandomBinaryJudge, require_llm_blender


class TestJudges(unittest.TestCase):
@@ -45,28 +38,6 @@ def test_all_true_judge(self):
self.assertEqual(len(judgements), 2)
self.assertTrue(all(judgement in {0, 1, -1} for judgement in judgements))

-def test_random_binary_judge(self):
-judge = RandomBinaryJudge()
-prompts, completions = self._get_prompts_and_single_completions()
-judgements = judge.judge(prompts=prompts, completions=completions)
-self.assertEqual(len(judgements), 2)
-self.assertTrue(all(judgement in {0, 1, -1} for judgement in judgements))
-
-def test_random_pairwise_judge(self):
-judge = RandomPairwiseJudge()
-prompts, completions = self._get_prompts_and_pairwise_completions()
-ranks = judge.judge(prompts=prompts, completions=completions)
-self.assertEqual(len(ranks), 2)
-self.assertTrue(all(isinstance(rank, int) for rank in ranks))
-
-def test_random_rank_judge(self):
-judge = RandomRankJudge()
-prompts, completions = self._get_prompts_and_pairwise_completions()
-ranks = judge.judge(prompts=prompts, completions=completions)
-self.assertEqual(len(ranks), 2)
-self.assertTrue(all(isinstance(rank, list) for rank in ranks))
-self.assertTrue(all(all(isinstance(rank, int) for rank in ranks) for ranks in ranks))

@unittest.skip("This test needs to be run manually since it requires a valid Hugging Face API key.")
def test_hugging_face_judge(self):
judge = HfPairwiseJudge()
@@ -84,6 +55,7 @@ def load_pair_rm_judge(self):
return PairRMJudge()
except ValueError:
time.sleep(5)
+raise ValueError("Failed to load PairRMJudge")

@require_llm_blender
def test_pair_rm_judge(self):
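
The added `raise` above closes out a retry around `PairRMJudge()` construction. A sketch of such a helper, assuming a bounded retry loop (only the `return`, `except`, `sleep`, and `raise` lines come from the diff; the loop bound, signature, and comments are assumptions):

```python
import time

from trl import PairRMJudge


def load_pair_rm_judge(retries: int = 5, delay: float = 5.0):
    # Constructing PairRMJudge downloads the PairRM model, which can fail
    # transiently (e.g. on CI), so retry a few times before giving up.
    for _ in range(retries):
        try:
            return PairRMJudge()
        except ValueError:
            time.sleep(delay)
    raise ValueError("Failed to load PairRMJudge")
```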
6 changes: 3 additions & 3 deletions tests/test_nash_md_trainer.py
@@ -20,9 +20,9 @@
from transformers.testing_utils import require_peft
from transformers.utils import is_peft_available

-from trl import NashMDConfig, NashMDTrainer, PairRMJudge
+from trl import NashMDConfig, NashMDTrainer

-from .testing_utils import require_llm_blender
+from .testing_utils import RandomPairwiseJudge, require_llm_blender


if is_peft_available():
@@ -174,7 +174,7 @@ def test_nash_md_trainer_judge_training(self, config_name):
report_to="none",
)
dummy_dataset = load_dataset("trl-internal-testing/zen", config_name)
-judge = PairRMJudge()
+judge = RandomPairwiseJudge()

trainer = NashMDTrainer(
model=self.model,
4 changes: 3 additions & 1 deletion tests/test_online_dpo_trainer.py
@@ -20,9 +20,11 @@
from transformers.testing_utils import require_peft
from transformers.utils import is_peft_available

-from trl import OnlineDPOConfig, OnlineDPOTrainer, RandomPairwiseJudge, is_llm_blender_available
+from trl import OnlineDPOConfig, OnlineDPOTrainer, is_llm_blender_available
from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE

+from .testing_utils import RandomPairwiseJudge


if is_peft_available():
from peft import LoraConfig, get_peft_model
4 changes: 3 additions & 1 deletion tests/test_xpo_trainer.py
@@ -20,7 +20,9 @@
from transformers.testing_utils import require_peft
from transformers.utils import is_peft_available

-from trl import RandomPairwiseJudge, XPOConfig, XPOTrainer, is_llm_blender_available
+from trl import XPOConfig, XPOTrainer, is_llm_blender_available

+from .testing_utils import RandomPairwiseJudge


if is_peft_available():
24 changes: 23 additions & 1 deletion tests/testing_utils.py
@@ -11,11 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import random
import unittest

from transformers import is_sklearn_available, is_wandb_available

-from trl import is_diffusers_available, is_llm_blender_available
+from trl import BaseBinaryJudge, BasePairwiseJudge, is_diffusers_available, is_llm_blender_available


def require_diffusers(test_case):
@@ -44,3 +45,24 @@ def require_llm_blender(test_case):
Decorator marking a test that requires llm-blender. Skips the test if llm-blender is not available.
"""
return unittest.skipUnless(is_llm_blender_available(), "test requires llm-blender")(test_case)


+class RandomBinaryJudge(BaseBinaryJudge):
+"""
+Random binary judge, for testing purposes.
+"""
+
+def judge(self, prompts, completions, gold_completions=None, shuffle_order=True):
+return [random.choice([0, 1, -1]) for _ in range(len(prompts))]
+
+
+class RandomPairwiseJudge(BasePairwiseJudge):
+"""
+Random pairwise judge, for testing purposes.
+"""
+
+def judge(self, prompts, completions, shuffle_order=True, return_scores=False):
+if not return_scores:
+return [random.randint(0, len(completion) - 1) for completion in completions]
+else:
+return [random.random() for _ in range(len(prompts))]
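
As a quick illustration, the relocated test judges can be exercised like this minimal sketch (it assumes the `tests` package is importable from the repository root; the printed values are random and purely illustrative):

```python
from tests.testing_utils import RandomBinaryJudge, RandomPairwiseJudge

# One completion per prompt; each judgement is 0, 1, or -1.
binary_judge = RandomBinaryJudge()
print(binary_judge.judge(prompts=["Q1", "Q2"], completions=["A1", "A2"]))  # e.g. [1, -1]

# A list of candidate completions per prompt; each result is the index of a
# randomly "preferred" completion, or a random score when return_scores=True.
pairwise_judge = RandomPairwiseJudge()
print(pairwise_judge.judge(prompts=["Q1", "Q2"], completions=[["A", "B"], ["C", "D"]]))  # e.g. [0, 1]
print(pairwise_judge.judge(prompts=["Q1"], completions=[["A", "B"]], return_scores=True))  # e.g. [0.42]
```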
8 changes: 1 addition & 7 deletions trl/__init__.py
@@ -50,8 +50,8 @@
"AlignPropConfig",
"AlignPropTrainer",
"AllTrueJudge",
"BaseJudge",
"BaseBinaryJudge",
"BaseJudge",
"BasePairwiseJudge",
"BaseRankJudge",
"BCOConfig",
@@ -81,9 +81,6 @@
"PairRMJudge",
"PPOConfig",
"PPOTrainer",
"RandomBinaryJudge",
"RandomPairwiseJudge",
"RandomRankJudge",
"RewardConfig",
"RewardTrainer",
"RLOOConfig",
@@ -173,9 +170,6 @@
PairRMJudge,
PPOConfig,
PPOTrainer,
-RandomBinaryJudge,
-RandomPairwiseJudge,
-RandomRankJudge,
RewardConfig,
RewardTrainer,
RLOOConfig,
8 changes: 0 additions & 8 deletions trl/trainer/__init__.py
@@ -42,9 +42,6 @@
"HfPairwiseJudge",
"OpenAIPairwiseJudge",
"PairRMJudge",
"RandomBinaryJudge",
"RandomPairwiseJudge",
"RandomRankJudge",
],
"kto_config": ["KTOConfig"],
"kto_trainer": ["KTOTrainer"],
@@ -109,9 +106,6 @@
HfPairwiseJudge,
OpenAIPairwiseJudge,
PairRMJudge,
-RandomBinaryJudge,
-RandomPairwiseJudge,
-RandomRankJudge,
)
from .kto_config import KTOConfig
from .kto_trainer import KTOTrainer
@@ -124,8 +118,6 @@
from .orpo_trainer import ORPOTrainer
from .ppo_config import PPOConfig
from .ppo_trainer import PPOTrainer
-from .ppov2_config import PPOv2Config
-from .ppov2_trainer import PPOv2Trainer
from .reward_config import RewardConfig
from .reward_trainer import RewardTrainer, compute_accuracy
from .rloo_config import RLOOConfig
29 changes: 0 additions & 29 deletions trl/trainer/judges.py
@@ -14,7 +14,6 @@

import concurrent.futures
import logging
-import random
from abc import ABC, abstractmethod
from typing import List, Optional, Union

@@ -183,34 +182,6 @@ def judge(
raise NotImplementedError("Judge subclasses must implement the `judge` method.")


-class RandomBinaryJudge(BaseBinaryJudge):
-"""
-Random binary judge, for testing purposes.
-"""
-
-def judge(self, prompts, completions, gold_completions=None, shuffle_order=True):
-return [random.choice([0, 1, -1]) for _ in range(len(prompts))]
-
-
-class RandomRankJudge(BaseRankJudge):
-"""
-Random rank, for testing purposes.
-"""
-
-def judge(self, prompts, completions, shuffle_order=True):
-num_completions = [len(completions[i]) for i in range(len(prompts))]
-return [random.sample(range(n), n) for n in num_completions]
-
-
-class RandomPairwiseJudge(BasePairwiseJudge):
-"""
-Random pairwise judge, for testing purposes.
-"""
-
-def judge(self, prompts, completions, shuffle_order=True):
-return [random.randint(0, len(completion) - 1) for completion in completions]


class PairRMJudge(BasePairwiseJudge):
"""
LLM judge based on the PairRM model from AllenAI.
