Merge branch 'main' into feature-chat-models
sadra-barikbin committed Aug 27, 2024
2 parents b44362b + 24adaa2 commit f881dc3
Showing 46 changed files with 1,325 additions and 1,211 deletions.
20 changes: 19 additions & 1 deletion README.md
@@ -146,6 +146,18 @@ accelerate launch --multi_gpu --num_processes=<num_gpus> -m \

You can find the template of the expected model configuration in [examples/model_configs/base_model.yaml](./examples/model_configs/base_model.yaml).

### Evaluating a quantized model

If you want to evaluate a model with quantization, it can be loaded in `4bit` or `8bit`. Under the hood, this uses `BitsAndBytesConfig` and can drastically reduce memory requirements on consumer-grade hardware.

An example configuration can be found in [examples/model_configs/quantized_model.yaml](./examples/model_configs/quantized_model.yaml).
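As a rough illustration of what the `4bit` setting implies, here is a minimal sketch using `transformers` and `bitsandbytes` directly; lighteval builds the equivalent config internally, and the model name is the one from the example config:

```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Load the model quantized to 4 bit; use load_in_8bit=True for 8-bit instead.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceH4/zephyr-7b-beta",
    quantization_config=quantization_config,
    device_map="auto",
)
```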

### Evaluating a PEFT model

If you want to evaluate a model trained with `peft`, check out [examples/model_configs/peft_model.yaml](./examples/model_configs/peft_model.yaml).

Currently, `lighteval` supports applying `adapter` and `delta` weights to the base model.
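For `adapter` weights, the merge is conceptually equivalent to the following sketch with the `peft` library (lighteval performs an equivalent merge internally; this is not its exact code, and the model names are the ones from the example config):

```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Apply the trained adapters to the base model, then fold them into its weights.
base = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
model = PeftModel.from_pretrained(base, "predibase/customer_support").merge_and_unload()
```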

### Evaluating a large model with pipeline parallelism

To evaluate models larger than ~40B parameters in 16-bit precision, you will need to shard the model across multiple GPUs to fit it in VRAM. You can do this by passing `model_parallel=True` and adapting `--num_processes` to the number of processes to use for data parallelism. For example, on a single node of 8 GPUs, you can run:
@@ -480,6 +492,12 @@ export CUDA_LAUNCH_BLOCKING=1
srun accelerate launch --multi_gpu --num_processes=8 -m lighteval accelerate --model_args "pretrained=your model name" --tasks examples/tasks/open_llm_leaderboard_tasks.txt --override_batch_size 1 --save_details --output_dir=your output dir
```

## Authentication

HuggingFace models (i.e. `base` models) are authenticated with a HuggingFace token; the `HF_TOKEN` is picked up directly from the environment.

For `tgi` models, authentication is provided in the config file. An example can be found at [tgi_model.yaml](./examples/model_configs/tgi_model.yaml).
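For `base` models, a minimal sketch of providing the token programmatically; the token value is a hypothetical placeholder, and in practice you would export `HF_TOKEN` in your shell instead:

```python
import os

# Hypothetical placeholder; lighteval picks HF_TOKEN up from the environment.
os.environ["HF_TOKEN"] = "hf_xxxxxxxxxxxxxxxxxxxxxxxx"
```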

## Releases

### Building the package
@@ -498,4 +516,4 @@ python3 -m build .
version = {0.3.0},
url = {https://github.com/huggingface/lighteval}
}
-```
+```
2 changes: 1 addition & 1 deletion community_tasks/_template.py
@@ -34,9 +34,9 @@
from lighteval.metrics import Metrics
from lighteval.metrics.metrics import SampleLevelMetric
from lighteval.metrics.utils import MetricCategory, MetricUseCase
+from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES


# DEFINE YOUR PROMPT FUNCTIONS
2 changes: 1 addition & 1 deletion community_tasks/arabic_evals.py
@@ -30,9 +30,9 @@
import re

from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES


# fmt: off
12 changes: 12 additions & 0 deletions examples/model_configs/peft_model.yaml
@@ -0,0 +1,12 @@
model:
  type: "base"
  base_params:
    model_args: "pretrained=predibase/customer_support,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... For a PEFT model, the pretrained model should be the one trained with PEFT, and the base model below contains the original model on which the adapters will be applied.
    dtype: "4bit" # Specifying "4bit" loads the model in 4 bit via BitsAndBytesConfig. The other option is "8bit" quantization.
    compile: true
  merged_weights: # Ignore this section if you are not using PEFT models
    delta_weights: false # set to true if your model should be merged with a base model; you also need to provide the base model name
    adapter_weights: true # set to true if your model has been trained with peft; you also need to provide the base model name
    base_model: "mistralai/Mistral-7B-v0.1" # path to the base model - needs to be specified only if delta_weights or adapter_weights is set to true
  generation:
    multichoice_continuations_start_space: null # If true/false, will force multiple choice continuations to start/not start with a space. If null, will do nothing
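For reference, `delta_weights: true` means the checkpoint stores deltas relative to a base model, so the full weights are reconstructed as base + delta. A hedged sketch of that reconstruction (not lighteval's exact internal code; the delta model path is hypothetical):

```python
import torch
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
delta = AutoModelForCausalLM.from_pretrained("your-org/your-delta-model")  # hypothetical

# Add the base weights onto the deltas in place: full = base + delta.
with torch.no_grad():
    for name, param in delta.named_parameters():
        param.add_(base.get_parameter(name))
```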
12 changes: 12 additions & 0 deletions examples/model_configs/quantized_model.yaml
@@ -0,0 +1,12 @@
model:
  type: "base"
  base_params:
    model_args: "pretrained=HuggingFaceH4/zephyr-7b-beta,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ...
    dtype: "4bit" # Specifying "4bit" loads the model in 4 bit via BitsAndBytesConfig. The other option is "8bit" quantization.
    compile: true
  merged_weights: # Ignore this section if you are not using PEFT models
    delta_weights: false # set to true if your model should be merged with a base model; you also need to provide the base model name
    adapter_weights: false # set to true if your model has been trained with peft; you also need to provide the base model name
    base_model: null # path to the base model - needs to be specified only if delta_weights or adapter_weights is set to true
  generation:
    multichoice_continuations_start_space: null # If true/false, will force multiple choice continuations to start/not start with a space. If null, will do nothing
4 changes: 2 additions & 2 deletions examples/nanotron/custom_evaluation_tasks.py
@@ -30,11 +30,11 @@
from dataclasses import asdict
from typing import Dict, List, Tuple

-import lighteval.tasks.tasks_prompt_formatting as prompt
+import lighteval.tasks.default_prompts as prompt
from lighteval.metrics import Metrics
+from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES


_TASKS_STRINGS: List[Tuple[LightevalTaskConfig, str]] = []
6 changes: 3 additions & 3 deletions src/lighteval/__main__.py
@@ -47,7 +47,7 @@ def cli_evaluate():
parser_nanotron(parser_b)

# Subparser for task utils functions
-parser_c = subparsers.add_parser("tasks", help="use nanotron as backend for evaluation.")
+parser_c = subparsers.add_parser("tasks", help="display information about available tasks and samples.")
parser_utils_tasks(parser_c)

args = parser.parse_args()
@@ -69,7 +69,7 @@ def cli_evaluate():
if args.inspect:
    print(f"Loading the tasks dataset to cache folder: {args.cache_dir}")
    print(
-        "All examples will be displayed without few shot, as few shot sample construction requires loading a model and using its tokenizer."
+        "All examples will be displayed without few shot, as few shot sample construction requires loading a model and using its tokenizer. "
    )
# Loading task
task_names_list, _ = taskinfo_selector(args.inspect)
@@ -78,7 +78,7 @@ def cli_evaluate():
print("-" * 10, name, "-" * 10)
if args.show_config:
print("-" * 10, "CONFIG")
task.print_config()
task.cfg.print()
for ix, sample in enumerate(task.eval_docs()[: int(args.num_samples)]):
if ix == 0:
print("-" * 10, "SAMPLES")
146 changes: 0 additions & 146 deletions src/lighteval/evaluator.py

This file was deleted.

8 changes: 7 additions & 1 deletion src/lighteval/logging/evaluation_tracker.py
@@ -30,6 +30,7 @@
from enum import Enum
from pathlib import Path

+import torch
from datasets import Dataset, load_dataset
from datasets.utils.metadata import MetadataConfigs
from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HFSummaryWriter, hf_hub_url
@@ -42,7 +43,8 @@
TaskConfigLogger,
VersionsLogger,
)
-from lighteval.utils import NO_TENSORBOARDX_WARN_MSG, is_nanotron_available, is_tensorboardX_available, obj_to_markdown
+from lighteval.utils.imports import NO_TENSORBOARDX_WARN_MSG, is_nanotron_available, is_tensorboardX_available
+from lighteval.utils.utils import obj_to_markdown


if is_nanotron_available():
@@ -63,6 +65,8 @@ def default(self, o):
    return str(o)
if callable(o):
    return o.__name__
+if isinstance(o, torch.dtype):
+    return str(o)
if isinstance(o, Enum):
    return o.name
return super().default(o)
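For context, a hedged usage sketch of the new `torch.dtype` branch, assuming the enclosing class is lighteval's `EnhancedJSONEncoder` (the class name comes from the surrounding file, not this hunk):

```python
import json

import torch

from lighteval.logging.evaluation_tracker import EnhancedJSONEncoder  # assumed name

# With the branch above, dtypes serialize as strings instead of raising TypeError.
print(json.dumps({"model_dtype": torch.bfloat16}, cls=EnhancedJSONEncoder))
# -> {"model_dtype": "torch.bfloat16"}
```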
@@ -167,6 +171,8 @@ def save(self) -> None:

config_general = copy.deepcopy(self.general_config_logger)
config_general = asdict(config_general)
+# We remove the config from logging, which contains context/accelerator objects
+config_general.pop("config")

to_dump = {
    "config_general": config_general,
2 changes: 1 addition & 1 deletion src/lighteval/logging/hierarchical_logger.py
@@ -26,7 +26,7 @@
from logging import Logger
from typing import Any, Callable

-from lighteval.utils import is_accelerate_available, is_nanotron_available
+from lighteval.utils.imports import is_accelerate_available, is_nanotron_available


if is_nanotron_available():