From ccdffc9b30d5989a906d7489b95b27de2f77ed22 Mon Sep 17 00:00:00 2001 From: Li Yin Date: Mon, 2 Sep 2024 09:02:00 -0700 Subject: [PATCH] update the datasets and evaluation guideline --- README.md | 2 +- docs/source/tutorials/datasets.rst | 16 +++++- docs/source/tutorials/evaluation.rst | 53 +++++++++++++++---- docs/source/tutorials/index.rst | 12 ++--- docs/source/use_cases/classification.rst | 2 + docs/source/use_cases/question_answering.rst | 2 +- tutorials/evaluation/eval.py | 34 ++++++++++++ .../bbh/object_count/train_new.py | 2 +- 8 files changed, 103 insertions(+), 20 deletions(-) create mode 100644 tutorials/evaluation/eval.py diff --git a/README.md b/README.md index 9e0a6c78..e271b56c 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ Many existing works greatly inspired AdalFlow library! Here is a non-exhaustive - 📚 [Micrograd](https://github.com/karpathy/micrograd): A tiny autograd engine for our auto-differentiative architecture. - 📚 [Text-Grad](https://github.com/zou-group/textgrad) for the ``Textual Gradient Descent`` text optimizer. - 📚 [DSPy](https://github.com/stanfordnlp/dspy) for inspiring the ``__{input/output}__fields`` in our ``DataClass`` and the bootstrap few-shot optimizer. -- 📚 [ORPO](https://github.com/google-deepmind/opro) for adding past text instruction along with its accuracy in the text optimizer. +- 📚 [OPRO](https://github.com/google-deepmind/opro) for adding past text instruction along with its accuracy in the text optimizer. - 📚 [PyTorch Lightning](https://github.com/Lightning-AI/pytorch-lightning) for the ``AdalComponent`` and ``Trainer``. # Citation diff --git a/docs/source/tutorials/datasets.rst b/docs/source/tutorials/datasets.rst index 186ce090..b079633d 100644 --- a/docs/source/tutorials/datasets.rst +++ b/docs/source/tutorials/datasets.rst @@ -2,4 +2,18 @@ Datasets ================ -Coming soon! + +Datasets are wrapped in a :class:`Dataset` object. 
+The `Dataset` will often be used together with :class:`utils.data.DataLoader` to load data in batches. +DataLoader can also handle parallel data loading with multiple workers and apply data shuffling. + +To be able to use your data, you need to: + +1. Create a subclass of :class:`DataClass` that defines the data structure, including a unique identifier, input and output fields for LLM calls. + +2. Create a subclass of :class:`utils.data.Dataset` that defines how to load the data (local/cloud), split the data, and convert it to your defined DataClass, and how to load and preprocess the data. Optionally, you can use PyTorch's dataset; note that it often works with Tensor, so you will need to convert it back to normal data at some point. + +By default, AdalFlow saves any downloaded datasets in the `~/.adalflow/cached_datasets` directory. + +You can see plenty of examples in the :ref:`apis-datasets` directory. +The examples of `DataClass` can be found at :ref:`datasets-types`. diff --git a/docs/source/tutorials/evaluation.rst b/docs/source/tutorials/evaluation.rst index 3641c5b9..0c10a752 100644 --- a/docs/source/tutorials/evaluation.rst +++ b/docs/source/tutorials/evaluation.rst @@ -14,7 +14,7 @@ Overall, such evaluation is a complex and multifaceted process. Below, we provid * **How to evaluate**: the protocols and metrics that are used for evaluation. -What to evaluate? +Tasks and Capabilities to Evaluate ------------------------------------------ When we are considering the LLM evaluation, the first question that arises is what to evaluate. Deciding what tasks to evaluate or which capabilities to assess is crucial, as it influences both the selection of appropriate benchmarks (where to evaluate) and the choice of evaluation methods (how to evaluate). 
Below are some commonly evaluated tasks and capabilities of LLMs: @@ -28,7 +28,7 @@ When we are considering the LLM evaluation, the first question that arises is wh For a more detailed and comprehensive description of the tasks and capabilities that LLMs are evaluated on, please refer to the review papers by *Chang et al.* [1]_ and *Guo et al.* [2]_. -Where to evaluate? +Datasets and Benchmarks ------------------------------------------ Once we have decided what to evaluate, the next question is where to evaluate. The selection of datasets and benchmarks is important, as it determines the quality and relevance of the evaluation. @@ -48,17 +48,37 @@ Please refer to the review papers (*Chang et al.* [1]_, *Guo et al.* [2]_, and * from datasets import load_dataset dataset = load_dataset(path="cais/mmlu", name='abstract_algebra') print(dataset["test"]) - # Dataset({ - # features: ['question', 'subject', 'choices', 'answer'], - # num_rows: 100 - # }) -How to evaluate? +The output will be a Dataset object containing the test set of the MMLU dataset. + +.. code-block:: json + Dataset({ + features: ['question', 'subject', 'choices', 'answer'], + num_rows: 100 + }) + +Evaluation Metrics ------------------------------------------ -The final question is how to evaluate. Evaluation methods can be divided into *automated evaluation* and *human evaluation* (*Chang et al.* [1]_ and *Liu et al.* [6]_). Automated evaluation typically involves using metrics such as accuracy and BERTScore or employing an LLM as the judge, to quantitatively assess the performance of LLMs on specific tasks. Human evaluation, on the other hand, involves human in the loop to evaluate the quality of the generated text or the performance of the LLM. Here, we recommend a few automated evaluation methods that can be used to evaluate LLMs and their applications. +The final question is how to evaluate. 
+Evaluation methods can be divided into *automated evaluation* and *human evaluation* (*Chang et al.* [1]_ and *Liu et al.* [6]_). +Automated evaluation typically involves using metrics such as accuracy and BERTScore or employing an LLM as the judge, to quantitatively assess the performance of LLMs on specific tasks. +Human evaluation, on the other hand, involves humans in the loop to evaluate the quality of the generated text or the performance of the LLM. + +Here, we recommend a few automated evaluation methods that can be used to evaluate LLMs and their applications. + +1. For classical NLU tasks, such as text classification and sentiment analysis, you can use metrics such as accuracy, F1-score, and ROC-AUC to evaluate the performance of LLM responses just like you would do using non-genAI models. +You can check out `TorchMetrics `_. -If you are interested in computing metrics such as accuracy, F1-score, ROUGE, BERTScore, perplexity, etc for LLMs and LLM applications, you can check out the metrics provided by `Hugging Face Metrics `_ or `TorchMetrics `_. For instance, to compute the BERTScore, you can use the corresponding metric function provided by Hugging Face, which uses the pre-trained contextual embeddings from BERT and matched words in generated text and reference text by cosine similarity. +2. For NLG tasks, such as text summarization, translation, and question answering: (1) you can use metrics such as ROUGE, BLEU, METEOR, BERTScore, perplexity, and :class:`LLMasJudge ` to evaluate the quality of the generated text with respect to the reference text. +You can check out the metrics provided by `Hugging Face Metrics `_. +For instance, to compute the BERTScore, you can use the corresponding metric function provided by Hugging Face, which uses the pre-trained contextual embeddings from BERT and matched words in generated text and reference text by cosine similarity. 
+(2) When you have no reference text, :class:`LLMasJudge ` with advanced model can be used to evaluate the generated text on the fly. + +3. For RAG (Retrieval-Augmented Generation) pipelines, you can use metrics such as :class:`RetrieverRecall `, :class:`RetrieverRelevance `, :class:`AnswerMatchAcc `, and :class:`LLMasJudge ` to evaluate the quality of the retrieved context and the generated answer. + +NLG Evaluation Examples +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python :linenos: @@ -69,12 +89,24 @@ If you are interested in computing metrics such as accuracy, F1-score, ROUGE, BE reference_text = ["life is great", "make it to the moon"] results = bertscore.compute(predictions=generated_text, references=reference_text, model_type="distilbert-base-uncased") print(results) - # {'precision': [0.9419728517532349, 0.7959791421890259], 'recall': [0.9419728517532349, 0.7749403119087219], 'f1': [0.9419728517532349, 0.7853187918663025], 'hashcode': 'distilbert-base-uncased_L5_no-idf_version=0.3.12(hug_trans=4.38.2)'} +The output will be a dictionary containing the precision, recall, and F1-score of the BERTScore metric for the generated text compared to the reference text. + +.. code-block:: json + + {'precision': [0.9419728517532349, 0.7959791421890259], 'recall': [0.9419728517532349, 0.7749403119087219], 'f1': [0.9419728517532349, 0.7853187918663025], 'hashcode': 'distilbert-base-uncased_L5_no-idf_version=0.3.12(hug_trans=4.38.2)'} + +RAG Evaluation Examples +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ If you are particulay interested in evaluating RAG (Retrieval-Augmented Generation) pipelines, we have several metrics available in AdalFlow to assess both the quality of the retrieved context and the quality of the final generated answer. +For the retriever: + - :class:`RetrieverRecall `: This is used to evaluate the recall of the retriever component of the RAG pipeline. 
- :class:`RetrieverRelevance `: This is used to evaluate the relevance of the retrieved context to the query. + +For the generator: + - :class:`AnswerMatchAcc `: This calculates the exact match accuracy or fuzzy match accuracy of the generated answers by comparing them to the ground truth answers. - :class:`LLMasJudge `: This uses an LLM to get the judgement of the generated answer for a list of questions. The task description and the judgement query of the LLM judge can be customized. It computes the judgement score, which is the number of generated answers that are judged as correct by the LLM divided by the total number of generated answers. @@ -84,6 +116,7 @@ For example, you can use the following code snippet to compute the recall and re :linenos: from adalflow.eval import RetrieverRecall, RetrieverRelevance + retrieved_contexts = [ "Apple is founded before Google.", "Feburary has 28 days in common years. Feburary has 29 days in leap years. Feburary is the second month of the year.", diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst index f3cca718..0512fb84 100644 --- a/docs/source/tutorials/index.rst +++ b/docs/source/tutorials/index.rst @@ -229,6 +229,8 @@ This section we will briefly cover the datasets and evaluation metrics supported Evaluating ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +You cannot optimize what you cannot measure. +In this section, we provide a general guide to the evaluation datasets, metrics, and methods to productionize your LLM tasks and to publish your research. .. list-table:: :widths: 20 80 @@ -236,20 +238,18 @@ Evaluating * - Part - Description - * - :doc:`datasets` - - The datasets used in the evaluation. * - :doc:`evaluation` - - The evaluation metrics and methods. + - A quick guide to the evaluation datasets, metrics, and methods. + * - :doc:`datasets` + - How to load and use the datasets in the library. .. 
toctree:: :maxdepth: 1 :caption: Evaluating :hidden: - - datasets - evaluation + datasets Training diff --git a/docs/source/use_cases/classification.rst b/docs/source/use_cases/classification.rst index ca0a49d0..2a0bf558 100644 --- a/docs/source/use_cases/classification.rst +++ b/docs/source/use_cases/classification.rst @@ -1,3 +1,5 @@ + + Classification Optimization ============================= diff --git a/docs/source/use_cases/question_answering.rst b/docs/source/use_cases/question_answering.rst index e2e0029e..d6001e5a 100644 --- a/docs/source/use_cases/question_answering.rst +++ b/docs/source/use_cases/question_answering.rst @@ -4,7 +4,7 @@ Try Quickstart in Colab - + GitHub Open Source Code diff --git a/tutorials/evaluation/eval.py b/tutorials/evaluation/eval.py new file mode 100644 index 00000000..70048752 --- /dev/null +++ b/tutorials/evaluation/eval.py @@ -0,0 +1,34 @@ +from adalflow.eval import RetrieverRecall, RetrieverRelevance + +retrieved_contexts = [ + "Apple is founded before Google.", + "Feburary has 28 days in common years. Feburary has 29 days in leap years. 
Feburary is the second month of the year.", +] +gt_contexts = [ + [ + "Apple is founded in 1976.", + "Google is founded in 1998.", + "Apple is founded before Google.", + ], + ["Feburary has 28 days in common years", "Feburary has 29 days in leap years"], +] + + +def evaluate_retriever(retrieved_contexts, gt_contexts): + retriever_recall = RetrieverRecall() + avg_recall, recall_list = retriever_recall.compute( + retrieved_contexts, gt_contexts + ) # Compute the recall of the retriever + retriever_relevance = RetrieverRelevance() + avg_relevance, relevance_list = retriever_relevance.compute( + retrieved_contexts, gt_contexts + ) # Compute the relevance of the retriever + return avg_recall, recall_list, avg_relevance, relevance_list + + +if __name__ == "__main__": + avg_recall, recall_list, avg_relevance, relevance_list = evaluate_retriever( + retrieved_contexts, gt_contexts + ) + print(f"avg_recall: {avg_recall}, recall_list: {recall_list}") + print(f"avg_relevance: {avg_relevance}, relevance_list: {relevance_list}") diff --git a/use_cases/question_answering/bbh/object_count/train_new.py b/use_cases/question_answering/bbh/object_count/train_new.py index 254c7c96..dbc57e61 100644 --- a/use_cases/question_answering/bbh/object_count/train_new.py +++ b/use_cases/question_answering/bbh/object_count/train_new.py @@ -2,7 +2,7 @@ ObjectCountTaskPipeline, ) -from LightRAG.use_cases.config import ( +from use_cases.config import ( gpt_3_model, gpt_4o_model, )