From ef8058883c15a1135c362263f972cc01e154ecfd Mon Sep 17 00:00:00 2001
From: Carlos Garcia Jurado Suarez
Date: Tue, 6 Feb 2024 13:46:25 -0800
Subject: [PATCH 1/2] feat: Implement SemScore metric

---
 .github/actions/deps/action.yaml     |  2 +-
 .mypy.ini                            |  3 +++
 README.md                            |  4 ++--
 notebooks/generate.ipynb             | 18 ++++++++++++------
 pyproject.toml                       |  3 +--
 src/autora/doc/classes/EvalResult.py |  3 ++-
 src/autora/doc/pipelines/main.py     | 20 +++++++-------------
 src/autora/doc/pipelines/metrics.py  | 23 ++++++++++++++++++++++-
 tests/test_main.py                   |  8 ++++----
 tests/test_metrics.py                | 16 +++++++++++++++-
 10 files changed, 69 insertions(+), 31 deletions(-)

diff --git a/.github/actions/deps/action.yaml b/.github/actions/deps/action.yaml
index fff24c8..2d6bb83 100644
--- a/.github/actions/deps/action.yaml
+++ b/.github/actions/deps/action.yaml
@@ -18,4 +18,4 @@ runs:
         python3 -m pip install --upgrade pip
     - name: Install project
       shell: sh
-      run: pip install ".[dev,train,cuda]"
+      run: pip install ".[dev,pipelines,cuda]"
diff --git a/.mypy.ini b/.mypy.ini
index 6e719c2..9bf3cdf 100644
--- a/.mypy.ini
+++ b/.mypy.ini
@@ -11,3 +11,6 @@ ignore_missing_imports = True
 
 [mypy-nltk.*]
 ignore_missing_imports = True
+
+[mypy-sentence_transformers.*]
+ignore_missing_imports = True
diff --git a/README.md b/README.md
index d51e74b..de851a7 100644
--- a/README.md
+++ b/README.md
@@ -31,14 +31,14 @@ Once you have created a new environment, you can install this project for local
 development using the following commands:
 
 ```
->> pip install -e .'[dev,train]'
+>> pip install -e .'[dev,pipelines]'
 >> pre-commit install
 >> conda install pandoc
 ```
 
 Notes:
 1) The single quotes around `'[dev]'` may not be required for your operating system.
-3) Look at `pyproject.toml` for other optional dependencies, e.g. you can do `pip install -e ."[dev,train,cuda]"` if you want to use CUDA.
+3) Look at `pyproject.toml` for other optional dependencies, e.g. you can do `pip install -e ."[dev,pipelines,cuda]"` if you want to use CUDA.
 2) `pre-commit install` will initialize pre-commit for this local repository, so
    that a set of tests will be run prior to completing a local commit. For more
    information, see the Python Project Template documentation on
diff --git a/notebooks/generate.ipynb b/notebooks/generate.ipynb
index 510de85..6e619bb 100644
--- a/notebooks/generate.ipynb
+++ b/notebooks/generate.ipynb
@@ -25,7 +25,7 @@
     "# Uncomment to clone and install autodoc from GitHub\n",
     "# !pip uninstall -y autora-doc\n",
     "# !git clone https://github.com/AutoResearch/autodoc.git\n",
-    "# !pip install \"./autodoc[cuda,train]\"\n",
+    "# !pip install \"./autodoc[cuda,pipelines]\"\n",
     "\n",
     "# IMPORTANT: Please restart the runtime after running the above commands"
    ]
@@ -42,7 +42,7 @@
     "%autoreload 2\n",
     "from autora.doc.runtime.predict_hf import Predictor, preprocess_code\n",
     "from autora.doc.runtime.prompts import PROMPTS, PromptIds, PromptBuilder, SYS_GUIDES\n",
-    "from autora.doc.pipelines.main import evaluate_documentation\n",
+    "from autora.doc.pipelines.metrics import eval_bleu_meteor, eval_semscore\n",
     "from autora.doc.pipelines.main import eval_prompt, load_data"
    ]
   },
@@ -111,9 +111,13 @@
     "        top_k=10,\n",
     "        num_ret_seq=1,\n",
     "    )\n",
-    "    bleu, meteor = evaluate_documentation(output, [label])\n",
+    "    bleu, meteor = eval_bleu_meteor(output, [label])\n",
+    "    sem_score = eval_semscore(output, [label])\n",
+    "\n",
     "    for i, o in enumerate(output):\n",
-    "        print(f\"{promptid}\\n******* Output {i} ********. bleu={bleu}, meteor={meteor}\\n{o}\\n*************\\n\")"
+    "        print(\n",
+    "            f\"{promptid}\\n******* Output {i} ********. bleu={bleu}, meteor={meteor}, sem_score={sem_score}\\n{o}\\n*************\\n\"\n",
+    "        )"
    ]
   },
   {
@@ -176,8 +180,10 @@
    },
    "outputs": [],
    "source": [
-    "out, bleu, meteor = eval_prompt(data_file, pred, prompt, {\"max_new_tokens\": 800.0})\n",
-    "print(f\"bleu={bleu}, meteor={meteor}\\n{out[0]}\\n*************\\n\")"
+    "eval_result = eval_prompt(data_file, pred, prompt, {\"max_new_tokens\": 800.0})\n",
+    "print(\n",
+    "    f\"bleu={eval_result.bleu_score}, meteor={eval_result.meteor_score}, sem_score={eval_result.sem_score}\\n{eval_result.predictions[0]}\\n*************\\n\"\n",
+    ")"
    ]
   },
   {
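
For readability, the JSON-escaped notebook hunks above correspond roughly to the following plain Python once unescaped (a sketch of the updated scoring cell; `pred`, `output`, `label`, and `promptid` are defined earlier in the notebook):

    # Score each generated docstring against the reference label.
    bleu, meteor = eval_bleu_meteor(output, [label])
    sem_score = eval_semscore(output, [label])

    for i, o in enumerate(output):
        print(
            f"{promptid}\n******* Output {i} ********. "
            f"bleu={bleu}, meteor={meteor}, sem_score={sem_score}\n{o}\n*************\n"
        )
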
diff --git a/pyproject.toml b/pyproject.toml
index ba26438..b553524 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,6 @@ dependencies = [
     # This works, while installing from pytorch and cuda from conda does not",
     "torch==2.0.1",
     "transformers>=4.37.2",
-    "nltk",
 ]
 
 # On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes)
@@ -44,7 +43,7 @@ dev = [
     "ipykernel",
     "hf_transfer",
 ]
-train = ["jsonlines", "mlflow"]
+pipelines = ["jsonlines", "mlflow", "nltk", "sentence-transformers>=2.3.1"]
 azure = ["azureml-core", "azureml-mlflow"]
 cuda = ["bitsandbytes>=0.42.0", "accelerate>=0.24.1", "xformers"]
 
diff --git a/src/autora/doc/classes/EvalResult.py b/src/autora/doc/classes/EvalResult.py
index 993a848..d0a8422 100644
--- a/src/autora/doc/classes/EvalResult.py
+++ b/src/autora/doc/classes/EvalResult.py
@@ -6,7 +6,8 @@
 class EvalResult:
     """Class for storing LLM evaluation results"""
 
-    prediction: List[str]
+    predictions: List[str]
     prompt: str
     bleu_score: Optional[float] = None
     meteor_score: Optional[float] = None
+    sem_score: Optional[float] = None
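
With the renamed `predictions` field and the new optional `sem_score`, an `EvalResult` is constructed and read as sketched below (the literal values are illustrative only; in the pipeline they come from `eval_prompt`):

    from autora.doc.classes.EvalResult import EvalResult

    # Hypothetical scores, for illustration; real values are computed by the metrics module.
    result = EvalResult(
        predictions=["Generates a sequence of trials for a Sweetpea experiment."],
        prompt="Generate a one-line docstring for the code below.",
        bleu_score=0.31,
        meteor_score=0.87,
        sem_score=0.79,
    )
    print(result.predictions[0], result.sem_score)
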
diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py
index 69c1f55..4a133e0 100644
--- a/src/autora/doc/pipelines/main.py
+++ b/src/autora/doc/pipelines/main.py
@@ -7,7 +7,7 @@
 import typer
 
 from autora.doc.classes.EvalResult import EvalResult
-from autora.doc.pipelines.metrics import eval_bleu_meteor
+from autora.doc.pipelines.metrics import eval_bleu_meteor, eval_semscore
 from autora.doc.runtime.predict_hf import Predictor
 from autora.doc.runtime.prompts import PROMPTS, PromptIds
 from autora.doc.util import get_prompts_from_file
@@ -52,14 +52,8 @@ def eval_prompts(
     predictor = Predictor(model_path)
     for i in range(len(prompts_list)):
         logger.info(f"Starting to run model on prompt {i}")
-        prediction_with_scores = eval_prompt(data_file, predictor, prompts_list[i], param_dict)
+        eval_result = eval_prompt(data_file, predictor, prompts_list[i], param_dict)
         logger.info(f"Model run completed on prompt {i}: {prompts_list[i]}")
-        eval_result = EvalResult(
-            prediction_with_scores[0],
-            prompts_list[i],
-            prediction_with_scores[1],
-            prediction_with_scores[2],
-        )
         results_list.append(eval_result)
     return results_list
 
@@ -72,7 +66,7 @@ def eval(
     param: List[str] = typer.Option(
         [], help="Additional float parameters to pass to the model as name=float pairs"
     ),
-) -> Tuple[List[str], float, float]:
+) -> EvalResult:
     import mlflow
 
     mlflow.autolog()
@@ -104,9 +98,7 @@ def load_data(data_file: str) -> Tuple[List[str], List[str]]:
     return inputs, labels
 
 
-def eval_prompt(
-    data_file: str, pred: Predictor, prompt: str, param_dict: Dict[str, float]
-) -> Tuple[List[str], float, float]:
+def eval_prompt(data_file: str, pred: Predictor, prompt: str, param_dict: Dict[str, float]) -> EvalResult:
     import mlflow
 
     inputs, labels = load_data(data_file)
@@ -115,6 +107,7 @@ def eval_prompt(
     predictions = pred.predict(prompt, inputs, **param_dict)
     timer_end = timer()
     bleu, meteor = eval_bleu_meteor(predictions, labels)
+    semscore = eval_semscore(predictions, labels)
     pred_time = timer_end - timer_start
     mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs)))
     for i in range(len(inputs)):
@@ -133,7 +126,8 @@ def eval_prompt(
     mlflow.log_metric("tokens/sec", total_tokens / pred_time)
     mlflow.log_metric("bleu_score", round(bleu, 5))
     mlflow.log_metric("meteor_score", round(meteor, 5))
-    return predictions, bleu, meteor
+    mlflow.log_metric("semscore", round(semscore, 5))
+    return EvalResult(predictions, prompt, bleu, meteor, semscore)
 
 
 @app.command()
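
Since `eval` and `eval_prompt` now return an `EvalResult` rather than a `(predictions, bleu, meteor)` tuple, call sites move from tuple unpacking to attribute access, as in the notebook and test updates in this patch (a sketch; `data_file`, `pred`, and `prompt` are set up by the caller):

    # Before: out, bleu, meteor = eval_prompt(data_file, pred, prompt, {"max_new_tokens": 800.0})
    eval_result = eval_prompt(data_file, pred, prompt, {"max_new_tokens": 800.0})
    print(eval_result.bleu_score, eval_result.meteor_score, eval_result.sem_score)
    print(eval_result.predictions[0])
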
diff --git a/src/autora/doc/pipelines/metrics.py b/src/autora/doc/pipelines/metrics.py
index 92bdf4c..5b87e06 100644
--- a/src/autora/doc/pipelines/metrics.py
+++ b/src/autora/doc/pipelines/metrics.py
@@ -3,6 +3,9 @@
 import nltk
 from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
 from nltk.translate.meteor_score import single_meteor_score
+from numpy import dot, mean, nan_to_num
+from numpy.linalg import norm
+from sentence_transformers import SentenceTransformer
 
 
 def eval_bleu_meteor(predictions: List[str], references: List[str]) -> Tuple[float, float]:
@@ -27,6 +30,24 @@ def eval_bleu_meteor(predictions: List[str], references: List[str]) -> Tuple[flo
         single_meteor_score(tokenized_ref, tokenized_pred)
         for tokenized_ref, tokenized_pred in zip(tokenized_references, tokenized_predictions)
     ]
-    meteor = sum(meteor_scores) / len(predictions) if predictions else 0
+    meteor: float = nan_to_num(mean(meteor_scores), nan=0)
 
     return (bleu, meteor)
+
+
+def eval_semscore(predictions: List[str], references: List[str]) -> float:
+    """
+    Calculate sentence embedding similarity score.
+    https://arxiv.org/pdf/2401.17072.pdf
+    """
+    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
+
+    def score(pred: str, ref: str) -> float:
+        encodings = model.encode([pred, ref])
+        assert len(encodings) == 2
+        cos_dist: float = dot(encodings[0], encodings[1]) / (norm(encodings[0]) * norm(encodings[1]))
+        return cos_dist
+
+    scores = [score(pred, ref) for pred, ref in zip(predictions, references)]
+    semscore: float = nan_to_num(mean(scores), nan=0)
+    return semscore
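
`eval_semscore` embeds each prediction/reference pair with `sentence-transformers/all-mpnet-base-v2` and averages the pairwise cosine similarities cos(a, b) = a·b / (|a| |b|), falling back to 0 when there are no pairs. Usage mirrors the tests added below; the exact value depends on the embedding model, so only loose bounds are asserted (a sketch):

    from autora.doc.pipelines.metrics import eval_semscore

    # Semantically similar sentences score high (close to 1), unrelated ones lower.
    score = eval_semscore(
        predictions=["this is great"],
        references=["this is really good"],
    )
    print(score)  # >= 0.6 according to the accompanying test
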
diff --git a/tests/test_main.py b/tests/test_main.py
index 13be933..2e9ba66 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -11,9 +11,9 @@ def test_predict() -> None:
     data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
-    outputs, _, _ = eval(str(data), TEST_HF_MODEL, PromptIds.SWEETP_1, [])
-    assert len(outputs) == 3, "Expected 3 outputs"
-    for output in outputs:
+    eval_result = eval(str(data), TEST_HF_MODEL, PromptIds.SWEETP_1, [])
+    assert len(eval_result.predictions) == 3, "Expected 3 outputs"
+    for output in eval_result.predictions:
         assert len(output) > 0, "Expected non-empty output"
 
 
@@ -42,5 +42,5 @@ def test_eval_prompts() -> None:
     results: List[EvalResult] = eval_prompts(str(data_file), TEST_HF_MODEL, str(prompts_file), [])
     assert len(results) == 3, "Expected 3 outputs"
     for result in results:
-        assert result.prediction is not None, "The prediction should not be None"
+        assert result.predictions is not None, "The prediction should not be None"
         assert result.prompt is not None, "The prompt should not be None"
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index 693b93a..8983862 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -3,7 +3,7 @@
 import jsonlines
 import pytest
 
-from autora.doc.pipelines.metrics import eval_bleu_meteor
+from autora.doc.pipelines.metrics import eval_bleu_meteor, eval_semscore
 
 
 def test_evaluation() -> None:
@@ -53,3 +53,17 @@ def test_partially_matching_tokens() -> None:
     bleu, meteor = eval_bleu_meteor(predictions, labels)
     assert 0.25 <= bleu <= 0.4, f"BLEU Score is {bleu}"
     assert 0.8 <= meteor <= 0.95, f"METEOR Score is {meteor}"
+
+
+def test_semscore() -> None:
+    # Test Case: SemScore is close to 1
+    labels = ["this is really good"]
+    predictions = ["this is great"]
+    semscore = eval_semscore(predictions, labels)
+    assert semscore >= 0.6, f"SemScore is {semscore}"
+
+    semscore = eval_semscore(labels, labels)
+    assert semscore == 1, f"SemScore is {semscore}"
+
+    semscore = eval_semscore([], [])
+    assert semscore == 0, f"SemScore is {semscore}"

From 1e2ce17f628a6114321687fc32a00eac0d964dc1 Mon Sep 17 00:00:00 2001
From: Carlos Garcia Jurado Suarez
Date: Tue, 6 Feb 2024 14:20:31 -0800
Subject: [PATCH 2/2] update conda.yml for azure pipeline

---
 azureml/conda.yml     | 1 +
 tests/test_metrics.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/azureml/conda.yml b/azureml/conda.yml
index fa4aa98..c69a214 100644
--- a/azureml/conda.yml
+++ b/azureml/conda.yml
@@ -17,3 +17,4 @@ dependencies:
   - nltk
   # This works, while installing from pytorch and cuda from conda does not
   - torch==2.0.1
+  - sentence_transformers>=2.3.1
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index 8983862..73782c0 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -63,7 +63,7 @@ def test_semscore() -> None:
     assert semscore >= 0.6, f"SemScore is {semscore}"
 
     semscore = eval_semscore(labels, labels)
-    assert semscore == 1, f"SemScore is {semscore}"
+    assert semscore == pytest.approx(1.0), f"SemScore is {semscore}"
 
     semscore = eval_semscore([], [])
     assert semscore == 0, f"SemScore is {semscore}"
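
The second patch relaxes the identical-input check to `pytest.approx`, since the cosine similarity of a sentence with itself is 1.0 only up to floating-point error. Roughly (pytest's default tolerance is about one part in a million):

    import pytest

    # Strict equality on floats is brittle; approx() compares within a tolerance.
    assert 0.99999992 == pytest.approx(1.0)
    assert 0.99999992 != 1.0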