Commit

Fix chat template; fix leaderboard math (EleutherAI#2475)
* batch commit

* Revert "batch commit"

This reverts commit d859d1c.

* batch commit

* checkout from main

* checkout from main

* checkout from main

* checkout from main

* checkout from main

* cleanup

* cleanup

* cleanup

* cleanup

* cleanup

* cleanup

* cleanup

* cleanup

* cleanup

* Chat template fix (OpenGPTX#7)

* cleanup

* cleanup

* cleanup

* linting

* fix tests

* add ifeval install to new_task CI

* Revert "add ifeval install to new_task CI"

This reverts commit 1d19449.

* adds leaderboard tasks (#1)

* adds leaderboard tasks

* Delete lm_eval/tasks/leaderboard/leaderboard_chat_template.yaml

* add readme

* Delete lm_eval/tasks/leaderboard/mmlu_pro/mmlu_pro_chat_template.yaml

* modify readme

* fix bbh task

* fix bbh salient task

* modify the readme

* Delete lm_eval/tasks/leaderboard/ifeval/README.md

* Delete lm_eval/tasks/leaderboard/math/README.md

* add leaderboard to the tasks directory

* add announcement about new leaderboard tasks

* linting

* Update README.md

Co-authored-by: Hailey Schoelkopf <[email protected]>

* installs ifeval dependency in new_task github workflow

---------

Co-authored-by: Nathan Habib <[email protected]>
Co-authored-by: Hailey Schoelkopf <[email protected]>

* fix math parser

* fix math parser

* fix version

* add warning about chat template

---------

Co-authored-by: Nathan Habib <[email protected]>
Co-authored-by: Nathan Habib <[email protected]>
Co-authored-by: Nathan Habib <[email protected]>
Co-authored-by: Hailey Schoelkopf <[email protected]>
Co-authored-by: Nathan Habib <[email protected]>
6 people authored Nov 11, 2024
1 parent bd80a6c commit 77c811e
Showing 6 changed files with 71 additions and 16 deletions.
26 changes: 26 additions & 0 deletions docs/chat-template-readme.md
@@ -0,0 +1,26 @@
# Chat Template Delimiter Handling Update

## Overview
This change modifies how delimiters are handled when applying chat templates in the request construction process for likelihood and multiple-choice based tasks. When `apply_chat_template` is set to `True`, the target delimiter is now set to an empty string instead of using the configured delimiter.

## Background
By default, the system uses a target delimiter (typically a single space " ") between the context and target text when constructing prompts. The full string is constructed as:
```
doc_to_text(doc) + target_delimiter + doc_to_target(doc)
```

While this worked well for base models where we wanted the model to predict a single whitespace followed by the answer, chat models have their own formatting conventions that handle spacing differently.

## The Change
- When `apply_chat_template=True`, the target delimiter is now empty ("") instead of the default whitespace
- This prevents interference between chat template formatting and the default delimiter system
- Particularly important for multiple choice tasks where the template itself handles spacing

## Example
```
# Before (with default delimiter " ")
<user>Question: What color is the sky?\nAnswer:<assistant> blue
# After
<user>Question: What color is the sky?\nAnswer:<assistant>blue
```
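
A minimal, self-contained sketch of the delimiter rule described above; `build_prompt` is illustrative, not the harness's actual API:

```python
def build_prompt(
    context: str,
    target: str,
    target_delimiter: str = " ",
    apply_chat_template: bool = False,
) -> str:
    # Chat templates manage their own spacing, so the configured
    # delimiter is dropped when a template is applied.
    delimiter = "" if apply_chat_template else target_delimiter
    return context + delimiter + target


# Base model: single space between "Answer:" and "blue"
print(build_prompt("Question: What color is the sky?\nAnswer:", "blue"))
# Chat template: no extra space; the template decides the formatting
print(build_prompt("Question: What color is the sky?\nAnswer:", "blue", apply_chat_template=True))
```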
5 changes: 5 additions & 0 deletions lm_eval/api/task.py
```diff
@@ -449,6 +449,7 @@ def build_all_requests(
                 doc=doc,
                 ctx=fewshot_ctx,
                 metadata=(self.config["task"], doc_id, self.config.repeats),
+                apply_chat_template=apply_chat_template,
             )
 
             if not isinstance(inst, list):
```
```diff
@@ -1301,6 +1302,8 @@ def doc_to_image(self, doc: Any, doc_to_image=None) -> Union[int, str, list]:
     def construct_requests(
         self, doc: dict, ctx: str, **kwargs
     ) -> Union[List[Instance], Instance]:
+        apply_chat_template = kwargs.pop("apply_chat_template", False)
+
         aux_arguments = None
 
         if self.OUTPUT_TYPE == "loglikelihood":
@@ -1310,6 +1313,8 @@
         elif self.OUTPUT_TYPE == "multiple_choice":
             choices = self.doc_to_choice(doc)
             target_delimiter = self.config.target_delimiter
+            if apply_chat_template:
+                target_delimiter = ""
             if self.multiple_input:
                 # If there are multiple inputs, choices are placed in the ctx
                 cont = self.doc_to_target(doc)
```
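
Downstream, loglikelihood-based multiple-choice scoring pairs the context with each delimited choice, which is where the empty delimiter takes effect. A simplified sketch (`build_mc_requests` is illustrative, not the harness's real method):

```python
def build_mc_requests(
    ctx: str,
    choices: list[str],
    target_delimiter: str,
    apply_chat_template: bool,
) -> list[tuple[str, str]]:
    # Each pair is scored as log P(delimiter + choice | ctx). With a chat
    # template applied, the delimiter is dropped so the template's own
    # spacing is the only spacing.
    if apply_chat_template:
        target_delimiter = ""
    return [(ctx, f"{target_delimiter}{choice}") for choice in choices]


pairs = build_mc_requests("Q: 2+2?\nA:", ["3", "4"], " ", apply_chat_template=True)
print(pairs)  # [('Q: 2+2?\nA:', '3'), ('Q: 2+2?\nA:', '4')]
```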
5 changes: 5 additions & 0 deletions lm_eval/evaluator.py
```diff
@@ -400,6 +400,11 @@ def evaluate(
 
     eval_logger.setLevel(getattr(logging, f"{verbosity}"))
 
+    if apply_chat_template:
+        eval_logger.warning(
+            "Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
+        )
+
     # tracks all Instances/requests a model must generate output on.
     requests = defaultdict(list)
     # stores the amount to pad out reqs per req. type so that
```
18 changes: 15 additions & 3 deletions lm_eval/models/huggingface.py
```diff
@@ -4,6 +4,7 @@
 from pathlib import Path
 from typing import Dict, List, Literal, Optional, Tuple, Union
 
+import jinja2
 import torch
 import torch.nn.functional as F
 import transformers
```
```diff
@@ -1344,9 +1345,20 @@ def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
         """
         Method to apply a chat template to a list of chat history between user and model.
         """
-        return self.tokenizer.apply_chat_template(
-            chat_history, tokenize=False, add_generation_prompt=True
-        )
+        try:
+            chat_templated = self.tokenizer.apply_chat_template(
+                chat_history, tokenize=False, add_generation_prompt=True
+            )
+        except jinja2.exceptions.TemplateError:
+            eval_logger.warning(
+                "Failed to apply chat template; removing the system role from the chat history and retrying."
+            )
+            chat_history = [msg for msg in chat_history if msg["role"] != "system"]
+            chat_templated = self.tokenizer.apply_chat_template(
+                chat_history, tokenize=False, add_generation_prompt=True
+            )
+
+        return chat_templated
 
     def get_model_info(self) -> dict:
         """
```
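
The fallback can be exercised without loading a model: the toy Jinja template below stands in for chat templates that reject a system role (real templates typically raise via a `raise_exception` helper; the toy one triggers a `TemplateError` with an undefined call):

```python
import jinja2

# Toy stand-in for a chat template that rejects system messages.
TOY_TEMPLATE = jinja2.Template(
    "{% for m in messages %}"
    "{% if m['role'] == 'system' %}{{ unsupported_role() }}{% endif %}"
    "<{{ m['role'] }}>{{ m['content'] }}"
    "{% endfor %}<assistant>"
)


def render_chat(messages: list) -> str:
    try:
        return TOY_TEMPLATE.render(messages=messages)
    except jinja2.exceptions.TemplateError:
        # Same recovery as the diff above: drop system turns and retry.
        messages = [m for m in messages if m["role"] != "system"]
        return TOY_TEMPLATE.render(messages=messages)


chat = [
    {"role": "system", "content": "Be brief."},
    {"role": "user", "content": "Hi"},
]
print(render_chat(chat))  # <user>Hi<assistant>
```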
2 changes: 1 addition & 1 deletion lm_eval/tasks/leaderboard/math/_template_yaml
```diff
@@ -18,7 +18,7 @@ metric_list:
     higher_is_better: true
 num_fewshot: 4
 metadata:
-  version: 1.0
+  version: 2.0
 dataset_kwargs:
   trust_remote_code: true
 fewshot_config:
```
31 changes: 19 additions & 12 deletions lm_eval/tasks/leaderboard/math/utils.py
```diff
@@ -17,6 +17,9 @@
 )
 
 
+INVALID_ANSWER = "[invalidanswer]"
+
+
 # taken from
 # https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py
 def doc_to_text(doc: dict) -> str:
```
```diff
@@ -70,7 +73,10 @@ def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
     unnormalized_answer = get_unnormalized_answer(candidates)
     answer = normalize_final_answer(unnormalized_answer)
 
-    if is_equiv(answer, doc["answer"]):
+    if answer == INVALID_ANSWER:
+        return {"exact_match": 0}
+
+    if answer.strip() == doc["answer"].strip() or is_equiv(answer, doc["answer"]):
         retval = 1
     else:
         retval = 0
```
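
The reordered checks buy both robustness and speed: an unparseable answer short-circuits to 0, and an exact string match skips the sympy round-trip entirely. A self-contained sketch with a trivial stand-in for `is_equiv`:

```python
INVALID_ANSWER = "[invalidanswer]"


def is_equiv_stub(x1: str, x2: str) -> bool:
    # Stand-in for the file's sympy-based is_equiv (illustration only).
    return x1 == x2


def exact_match(answer: str, gold: str) -> int:
    if answer == INVALID_ANSWER:
        return 0  # unparseable output scores 0 without touching sympy
    if answer.strip() == gold.strip():
        return 1  # fast string path
    return int(is_equiv_stub(answer, gold))  # symbolic fallback


print(exact_match("42", "42 "))              # 1 via the string path
print(exact_match("[invalidanswer]", "42"))  # 0 immediately
```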
```diff
@@ -112,17 +118,19 @@ def last_boxed_only_string(string: str) -> Optional[str]:
 
 
 def remove_boxed(s: str) -> str:
-    if "\\boxed " in s:
-        left = "\\boxed "
-        assert s[: len(left)] == left
-        return s[len(left) :]
-
-    left = "\\boxed{"
-
-    assert s[: len(left)] == left
-    assert s[-1] == "}"
-
-    return s[len(left) : -1]
+    try:
+        if "\\boxed " in s:
+            left = "\\boxed "
+            assert s[: len(left)] == left
+            return s[len(left) :]
+
+        left = "\\boxed{"
+
+        assert s[: len(left)] == left
+        assert s[-1] == "}"
+        return s[len(left) : -1]
+    except AssertionError:
+        return INVALID_ANSWER
 
 
 class timeout:
```
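
A quick check of the hardened behavior, with `remove_boxed` copied from the patched version above so it runs standalone:

```python
INVALID_ANSWER = "[invalidanswer]"


def remove_boxed(s: str) -> str:
    # Patched version from the diff above: malformed \boxed expressions
    # now yield INVALID_ANSWER instead of raising AssertionError.
    try:
        if "\\boxed " in s:
            left = "\\boxed "
            assert s[: len(left)] == left
            return s[len(left) :]

        left = "\\boxed{"

        assert s[: len(left)] == left
        assert s[-1] == "}"
        return s[len(left) : -1]
    except AssertionError:
        return INVALID_ANSWER


print(remove_boxed("\\boxed{42}"))  # 42
print(remove_boxed("\\boxed 42"))   # 42
print(remove_boxed("oops"))         # [invalidanswer] (previously crashed)
```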
```diff
@@ -146,7 +154,7 @@ def is_equiv(x1: str, x2: str) -> bool:
     x1 and x2 are normalized latex string
     """
     try:
-        with timeout(seconds=5):
+        with timeout(seconds=1):
             try:
                 parsed_x1 = parse_latex(x1)
                 parsed_x2 = parse_latex(x2)
```
```diff
@@ -185,7 +193,6 @@ def is_equiv(x1: str, x2: str) -> bool:
 
 
 def get_unnormalized_answer(text: str) -> str:
-    INVALID_ANSWER = "[invalidanswer]"
     end_seq = "I hope it is correct."
     text += end_seq
     match = re.search(
```
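
The `timeout(seconds=1)` guard caps how long a single sympy comparison may hang. The class body is collapsed in this view, so the signal-based sketch below is an assumption about its shape, not the file's exact code:

```python
import signal


class timeout:
    # Assumed SIGALRM-based implementation (Unix-only); the real class
    # body is collapsed in the diff above.
    def __init__(self, seconds: int = 1, error_message: str = "Timeout"):
        self.seconds = seconds
        self.error_message = error_message

    def _handle(self, signum, frame):
        raise TimeoutError(self.error_message)

    def __enter__(self):
        signal.signal(signal.SIGALRM, self._handle)
        signal.alarm(self.seconds)

    def __exit__(self, exc_type, exc_value, traceback):
        signal.alarm(0)


with timeout(seconds=1):
    pass  # e.g. parse_latex / sympy simplification that may hang
```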
