diff --git a/.mypy.ini b/.mypy.ini
index d9e2214..6e719c2 100644
--- a/.mypy.ini
+++ b/.mypy.ini
@@ -8,3 +8,6 @@ ignore_missing_imports = True

 [mypy-mlflow.*]
 ignore_missing_imports = True
+
+[mypy-nltk.*]
+ignore_missing_imports = True
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index cc58e8e..fe08b75 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -78,7 +78,7 @@ repos:
         # supported by your project here, or alternatively use
         # pre-commit's default_language_version, see
         # https://pre-commit.com/#top_level-default_language_version
-        language_version: python3.10
+        language_version: python3
diff --git a/azureml/conda.yml b/azureml/conda.yml
index fab3656..fa4aa98 100644
--- a/azureml/conda.yml
+++ b/azureml/conda.yml
@@ -14,5 +14,6 @@ dependencies:
   - transformers>=4.35.2
   - xformers
   - scipy
+  - nltk
   # This works, while installing from pytorch and cuda from conda does not
   - torch==2.0.1
diff --git a/pyproject.toml b/pyproject.toml
index e1c4382..7ce597f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ dependencies = [
    # This works, while installing from pytorch and cuda from conda does not",
    "torch==2.0.1",
    "transformers>=4.35.2",
+   "nltk",
 ]

 # On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes)
diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py
index a553153..d303bf5 100644
--- a/src/autora/doc/pipelines/main.py
+++ b/src/autora/doc/pipelines/main.py
@@ -1,10 +1,13 @@
 import itertools
 import logging
 from timeit import default_timer as timer
-from typing import List
+from typing import List, Tuple

+import nltk
 import torch
 import typer
+from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
+from nltk.translate.meteor_score import single_meteor_score

 from autora.doc.runtime.predict_hf import Predictor
 from autora.doc.runtime.prompts import INSTR, SYS, InstructionPrompts, SystemPrompts
@@ -17,6 +20,33 @@ logger = logging.getLogger(__name__)


+def evaluate_documentation(predictions: List[List[str]], references: List[str]) -> Tuple[float, float]:
+    nltk.download("wordnet")
+
+    # Tokenize references
+    tokenized_references = [ref.split() for ref in references]
+    # Currently there is only 1 prediction for 1 reference, need to avg in future
+    tokenized_predictions = [pred[0].split() if pred else [] for pred in predictions]
+
+    # Calculate BLEU score with smoothing function
+    # SmoothingFunction().method1 is used to avoid zero scores for n-grams not found in the reference.
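+    # Note: corpus_bleu expects, for each hypothesis, a list of alternative reference
+    # token lists, which is why each single reference is wrapped in its own list below.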
+    bleu = corpus_bleu(
+        # Wrap each reference list in another list
+        [[tokenized_ref] for tokenized_ref in tokenized_references],
+        tokenized_predictions,
+        smoothing_function=SmoothingFunction().method1,
+    )
+
+    # Calculate METEOR scores
+    meteor_scores = [
+        single_meteor_score(tokenized_ref, tokenized_pred)
+        for tokenized_ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
+    ]
+    meteor = sum(meteor_scores) / len(predictions) if predictions else 0
+
+    return (bleu, meteor)
+
+
 @app.command(help="Evaluate model on a data file")
 def eval(
     data_file: str = typer.Argument(..., help="JSONL Data file to evaluate on"),
@@ -55,6 +85,8 @@ def eval(
     pred = Predictor(model_path)
     timer_start = timer()
     predictions = pred.predict(sys_prompt, instr_prompt, inputs, **param_dict)
+    bleu, meteor = evaluate_documentation(predictions, labels)
+
     timer_end = timer()
     pred_time = timer_end - timer_start
     mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
@@ -63,6 +95,8 @@ def eval(
         mlflow.log_text(inputs[i], f"input_{i}.py")
         for j in range(len(predictions[i])):
             mlflow.log_text(predictions[i][j], f"prediction_{i}_{j}.txt")
+    mlflow.log_text(str(bleu), "bleu_score.txt")
+    mlflow.log_text(str(meteor), "meteor_score.txt")

     # flatten predictions for counting tokens
     predictions_flat = list(itertools.chain.from_iterable(predictions))
@@ -70,6 +104,8 @@ def eval(
     total_tokens = sum([len(token) for token in tokens])
     mlflow.log_metric("total_tokens", total_tokens)
     mlflow.log_metric("tokens/sec", total_tokens / pred_time)
+    mlflow.log_metric("bleu_score", round(bleu, 5))
+    mlflow.log_metric("meteor_score", round(meteor, 5))
     return predictions


diff --git a/tests/test_main.py b/tests/test_main.py
index f5a283e..a1eed5f 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -1,6 +1,9 @@
 from pathlib import Path

-from autora.doc.pipelines.main import eval, generate, import_data
+import jsonlines
+import pytest
+
+from autora.doc.pipelines.main import eval, evaluate_documentation, generate, import_data
 from autora.doc.runtime.prompts import InstructionPrompts, SystemPrompts

 # dummy HF model for testing
@@ -15,6 +18,55 @@ def test_predict() -> None:
     assert len(output[0]) > 0, "Expected non-empty output"


+def test_evaluation() -> None:
+    # Test case: predictions identical to references, so METEOR and BLEU scores are close to 1
+    data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
+    with jsonlines.open(data) as reader:
+        items = [item for item in reader]
+        labels = [item["output"] for item in items]
+        predictions = [[item["output"]] for item in items]
+
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert bleu == pytest.approx(1, 0.01), f"BLEU Score is {bleu}"
+    assert meteor == pytest.approx(1, 0.01), f"METEOR Score is {meteor}"
+
+
+def test_extra_token_in_prediction() -> None:
+    # An extra token lowers BLEU's n-gram precision; METEOR is robust to small mistakes
+    labels = ["this is a test"]
+    predictions = [["this is a test extra"]]
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert 0.6 <= bleu <= 0.8, f"BLEU Score is {bleu}"
+    assert 0.8 <= meteor <= 1, f"METEOR Score is {meteor}"
+
+
+def test_missing_token_in_prediction() -> None:
+    # A missing token triggers BLEU's brevity penalty; METEOR stays higher
+    labels = ["this is a test"]
+    predictions = [["this is a"]]
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert 0.4 <= bleu <= 0.6, f"BLEU Score is {bleu}"
+    assert 0.6 <= meteor <= 0.8, f"METEOR Score is {meteor}"
+
+
+def test_completely_different_tokens() -> None:
+    # With no tokens in common, both scores are close to zero
+    labels = ["this is a test"]
+    predictions = [["completely different sentence"]]
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert bleu <= 0.1, f"BLEU Score is {bleu}"
+    assert meteor <= 0.1, f"METEOR Score is {meteor}"
+
+
+def test_partially_matching_tokens() -> None:
+    # An inserted token breaks most n-gram matches, so BLEU is low; METEOR still rewards the unigram overlap
+    labels = ["this is a test"]
+    predictions = [["this is a different test"]]
+    bleu, meteor = evaluate_documentation(predictions, labels)
+    assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}"
+    assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"
+
+
 def test_generate() -> None:
     python_file = __file__
     output = Path("output.txt")
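For reference, a minimal sketch of how the new evaluate_documentation helper can be exercised; the example strings are illustrative (not taken from the repository's data), and the first call downloads the WordNet data that METEOR needs:

    from autora.doc.pipelines.main import evaluate_documentation

    labels = ["this is a test"]          # one reference string per input
    predictions = [["this is a test"]]   # one list of generated docs per input
    bleu, meteor = evaluate_documentation(predictions, labels)
    print(f"BLEU: {bleu:.3f}, METEOR: {meteor:.3f}")  # both near 1.0 for an exact match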