diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py
index d01c407..69c1f55 100644
--- a/src/autora/doc/pipelines/main.py
+++ b/src/autora/doc/pipelines/main.py
@@ -3,13 +3,11 @@
 from timeit import default_timer as timer
 from typing import Dict, List, Tuple
 
-import nltk
 import torch
 import typer
-from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
-from nltk.translate.meteor_score import single_meteor_score
 
 from autora.doc.classes.EvalResult import EvalResult
+from autora.doc.pipelines.metrics import eval_bleu_meteor
 from autora.doc.runtime.predict_hf import Predictor
 from autora.doc.runtime.prompts import PROMPTS, PromptIds
 from autora.doc.util import get_prompts_from_file
@@ -22,33 +20,6 @@
 logger = logging.getLogger(__name__)
 
 
-def evaluate_documentation(predictions: List[str], references: List[str]) -> Tuple[float, float]:
-    nltk.download("wordnet")
-
-    # Tokenize references
-    tokenized_references = [ref.split() for ref in references]
-    # Currently there is only 1 prediction for 1 reference, need to avg in future
-    tokenized_predictions = [pred.split() if pred else [] for pred in predictions]
-
-    # Calculate BLEU score with smoothing function
-    # SmoothingFunction().method1 is used to avoid zero scores for n-grams not found in the reference.
-    bleu = corpus_bleu(
-        # Wrap each reference list in another list
-        [[tokenized_ref] for tokenized_ref in tokenized_references],
-        tokenized_predictions,
-        smoothing_function=SmoothingFunction().method1,
-    )
-
-    # Calculate METEOR scores
-    meteor_scores = [
-        single_meteor_score(tokenized_ref, tokenized_pred)
-        for tokenized_ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
-    ]
-    meteor = sum(meteor_scores) / len(predictions) if predictions else 0
-
-    return (bleu, meteor)
-
-
 @app.command(help="Evaluate a model for code-to-documentation generation for all prompts in the prompts_file")
 def eval_prompts(
     data_file: str = typer.Argument(..., help="JSONL Data file to evaluate on"),
@@ -143,7 +114,7 @@ def eval_prompt(
     timer_start = timer()
     predictions = pred.predict(prompt, inputs, **param_dict)
     timer_end = timer()
-    bleu, meteor = evaluate_documentation(predictions, labels)
+    bleu, meteor = eval_bleu_meteor(predictions, labels)
     pred_time = timer_end - timer_start
     mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
     for i in range(len(inputs)):
diff --git a/src/autora/doc/pipelines/metrics.py b/src/autora/doc/pipelines/metrics.py
new file mode 100644
index 0000000..92bdf4c
--- /dev/null
+++ b/src/autora/doc/pipelines/metrics.py
@@ -0,0 +1,32 @@
+from typing import List, Tuple
+
+import nltk
+from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
+from nltk.translate.meteor_score import single_meteor_score
+
+
+def eval_bleu_meteor(predictions: List[str], references: List[str]) -> Tuple[float, float]:
+    nltk.download("wordnet")
+
+    # Tokenize references
+    tokenized_references = [ref.split() for ref in references]
+    # Currently there is only 1 prediction for 1 reference, need to avg in future
+    tokenized_predictions = [pred.split() if pred else [] for pred in predictions]
+
+    # Calculate BLEU score with smoothing function
+    # SmoothingFunction().method1 is used to avoid zero scores for n-grams not found in the reference.
+    bleu = corpus_bleu(
+        # Wrap each reference list in another list
+        [[tokenized_ref] for tokenized_ref in tokenized_references],
+        tokenized_predictions,
+        smoothing_function=SmoothingFunction().method1,
+    )
+
+    # Calculate METEOR scores
+    meteor_scores = [
+        single_meteor_score(tokenized_ref, tokenized_pred)
+        for tokenized_ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
+    ]
+    meteor = sum(meteor_scores) / len(predictions) if predictions else 0
+
+    return (bleu, meteor)
diff --git a/tests/test_main.py b/tests/test_main.py
index 2437be1..13be933 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -1,11 +1,8 @@
 from pathlib import Path
-from typing import Dict, List
-
-import jsonlines
-import pytest
+from typing import List
 
 from autora.doc.classes.EvalResult import EvalResult
-from autora.doc.pipelines.main import eval, eval_prompts, evaluate_documentation, generate, import_data
+from autora.doc.pipelines.main import eval, eval_prompts, generate, import_data
 from autora.doc.runtime.prompts import PromptIds
 
 # dummy HF model for testing
@@ -20,55 +17,6 @@ def test_predict() -> None:
     assert len(output) > 0, "Expected non-empty output"
 
 
-def test_evaluation() -> None:
-    # Test Case: Meteor and Bleu scores are close to 1
-    data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
-    with jsonlines.open(data) as reader:
-        items = [item for item in reader]
-        labels = [item["output"] for item in items]
-        predictions = [item["output"] for item in items]
-
-    bleu, meteor = evaluate_documentation(predictions, labels)
-    assert bleu == pytest.approx(1, 0.01), f"BLEU Score is {bleu}"
-    assert meteor == pytest.approx(1, 0.01), f"METEOR Score is {meteor}"
-
-
-def test_extra_token_in_prediction() -> None:
-    # Test Case bleu score should be less due to brevity penalty and meteor is robust to small mistakes
-    labels = ["this is a test"]
-    predictions = ["this is a test extra"]
-    bleu, meteor = evaluate_documentation(predictions, labels)
-    assert 0.6 <= bleu <= 0.8, f"BLEU Score is {bleu}"
-    assert 0.8 <= meteor <= 1, f"METEOR Score is {meteor}"
-
-
-def test_missing_token_in_prediction() -> None:
-    # bleu score is less, meteor is higher
-    labels = ["this is a test"]
-    predictions = ["this is a"]
-    bleu, meteor = evaluate_documentation(predictions, labels)
-    assert 0.4 <= bleu <= 0.6, f"BLEU Score is {bleu}"
-    assert 0.6 <= meteor <= 0.8, f"METEOR Score is {meteor}"
-
-
-def test_completely_different_tokens() -> None:
-    # both scores are less, as no common tokens
-    labels = ["this is a test"]
-    predictions = ["completely different sentence"]
-    bleu, meteor = evaluate_documentation(predictions, labels)
-    assert bleu <= 0.1, f"BLEU Score is {bleu}"
-    assert meteor <= 0.1, f"METEOR Score is {meteor}"
-
-
-def test_partially_matching_tokens() -> None:
-    # As ngrams arent matching because of extra token within, BLEU score is very less. Meteor gives a good score only.
-    labels = ["this is a test"]
-    predictions = ["this is a different test"]
-    bleu, meteor = evaluate_documentation(predictions, labels)
-    assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}"
-    assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"
-
-
 def test_generate() -> None:
     python_file = __file__
     output = Path("output.txt")
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
new file mode 100644
index 0000000..693b93a
--- /dev/null
+++ b/tests/test_metrics.py
@@ -0,0 +1,55 @@
+from pathlib import Path
+
+import jsonlines
+import pytest
+
+from autora.doc.pipelines.metrics import eval_bleu_meteor
+
+
+def test_evaluation() -> None:
+    # Test Case: Meteor and Bleu scores are close to 1
+    data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
+    with jsonlines.open(data) as reader:
+        items = [item for item in reader]
+        labels = [item["output"] for item in items]
+        predictions = [item["output"] for item in items]
+
+    bleu, meteor = eval_bleu_meteor(predictions, labels)
+    assert bleu == pytest.approx(1, 0.01), f"BLEU Score is {bleu}"
+    assert meteor == pytest.approx(1, 0.01), f"METEOR Score is {meteor}"
+
+
+def test_extra_token_in_prediction() -> None:
+    # Test Case bleu score should be less due to brevity penalty and meteor is robust to small mistakes
+    labels = ["this is a test"]
+    predictions = ["this is a test extra"]
+    bleu, meteor = eval_bleu_meteor(predictions, labels)
+    assert 0.6 <= bleu <= 0.8, f"BLEU Score is {bleu}"
+    assert 0.8 <= meteor <= 1, f"METEOR Score is {meteor}"
+
+
+def test_missing_token_in_prediction() -> None:
+    # bleu score is less, meteor is higher
+    labels = ["this is a test"]
+    predictions = ["this is a"]
+    bleu, meteor = eval_bleu_meteor(predictions, labels)
+    assert 0.4 <= bleu <= 0.6, f"BLEU Score is {bleu}"
+    assert 0.6 <= meteor <= 0.8, f"METEOR Score is {meteor}"
+
+
+def test_completely_different_tokens() -> None:
+    # both scores are less, as no common tokens
+    labels = ["this is a test"]
+    predictions = ["completely different sentence"]
+    bleu, meteor = eval_bleu_meteor(predictions, labels)
+    assert bleu <= 0.1, f"BLEU Score is {bleu}"
+    assert meteor <= 0.1, f"METEOR Score is {meteor}"
+
+
+def test_partially_matching_tokens() -> None:
+    # As ngrams arent matching because of extra token within, BLEU score is very less. Meteor gives a good score only.
+    labels = ["this is a test"]
+    predictions = ["this is a different test"]
+    bleu, meteor = eval_bleu_meteor(predictions, labels)
+    assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}"
+    assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"
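Usage note (not part of the patch): a minimal sketch of calling the relocated helper directly, as the existing tests do. The example strings below are illustrative assumptions; the first call downloads the NLTK wordnet corpus via nltk.download("wordnet").

    from autora.doc.pipelines.metrics import eval_bleu_meteor

    # Illustrative inputs: parallel lists of predicted and reference documentation strings.
    predictions = ["this is a generated docstring"]
    references = ["this is a reference docstring"]

    # Returns (BLEU, METEOR) as floats; triggers the wordnet download on first use.
    bleu, meteor = eval_bleu_meteor(predictions, references)
    print(f"BLEU: {bleu:.3f}, METEOR: {meteor:.3f}")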