diff --git a/.doctrees/apis/eval/eval.llm_as_judge.doctree b/.doctrees/apis/eval/eval.llm_as_judge.doctree index 45b9cba44..1cb9617ad 100644 Binary files a/.doctrees/apis/eval/eval.llm_as_judge.doctree and b/.doctrees/apis/eval/eval.llm_as_judge.doctree differ diff --git a/.doctrees/developer_notes/evaluation.doctree b/.doctrees/developer_notes/evaluation.doctree index fd8000537..421ce101e 100644 Binary files a/.doctrees/developer_notes/evaluation.doctree and b/.doctrees/developer_notes/evaluation.doctree differ diff --git a/.doctrees/developer_notes/index.doctree b/.doctrees/developer_notes/index.doctree index e499ac576..07d03954f 100644 Binary files a/.doctrees/developer_notes/index.doctree and b/.doctrees/developer_notes/index.doctree differ diff --git a/.doctrees/environment.pickle b/.doctrees/environment.pickle index 06e250424..cc75220ea 100644 Binary files a/.doctrees/environment.pickle and b/.doctrees/environment.pickle differ diff --git a/_modules/eval/llm_as_judge.html b/_modules/eval/llm_as_judge.html index dd6822b74..ff299906c 100644 --- a/_modules/eval/llm_as_judge.html +++ b/_modules/eval/llm_as_judge.html @@ -453,7 +453,7 @@

Source code for eval.llm_as_judge

 class DefaultLLMJudge(Component):
     __doc__ = r"""Demonstrate how to use an LLM/Generator to output True or False for a judgement query.
 
-    You can use any of your template to adapt to more tasks and sometimes you can directly ask LLM to output a score in range [0, 1] instead of only True or False.
+    You can use any of your template to adapt to more tasks and sometimes you can directly ask LLM to output a score in range [0, 1] instead of only True or False.
 
     A call on the LLM judge equalize to _compute_single_item method.
 
@@ -495,8 +495,8 @@ 

Source code for eval.llm_as_judge

 
         Args:
             question (str): Question string.
-            gt_answer (str): Ground truth answer string.
             pred_answer (str): Predicted answer string.
+            gt_answer (str): Ground truth answer string.
             judgement_query (str): Judgement query string.
 
         Returns:
@@ -543,7 +543,7 @@ 

Source code for eval.llm_as_judge

         >>> judgement_query = "For the question, does the predicted answer contain the ground truth answer?"
         >>> llm_judge = LLMasJudge()
         >>> avg_judgement, judgement_list = llm_judge.compute(
-        questions, gt_answers, pred_answers, judgement_query
+        questions, pred_answers, gt_answers, judgement_query
         )
         >>> avg_judgement
         2 / 3
@@ -562,8 +562,8 @@ 

Source code for eval.llm_as_judge

     def compute(
         self,
         questions: List[str],
-        gt_answers: List[str],
         pred_answers: List[str],
+        gt_answers: List[str],
         judgement_query: str,
     ) -> List[bool]:
         r"""
@@ -571,21 +571,19 @@ 

Source code for eval.llm_as_judge

 
         Args:
             questions (List[str]): List of question strings.
-            gt_answers (List[str]): List of ground truth answer strings.
             pred_answers (List[str]): List of predicted answer strings.
+            gt_answers (List[str]): List of ground truth answer strings.
             judgement_query (str): Judgement query string.
 
         Returns:
-            tuple:
-                - float: Average judgement score.
-                - List[bool]: Judgement results for each query.
+            List[bool]: Judgement results.
         """
         judgement_list = []
-        for question, gt_answer, pred_answer in zip(
-            questions, gt_answers, pred_answers
+        for question, pred_answer, gt_answer in zip(
+            questions, pred_answers, gt_answers
         ):
             judgement = self.llm_evaluator(
-                question, gt_answer, pred_answer, judgement_query
+                question, pred_answer, gt_answer, judgement_query
             )
             judgement_list.append(judgement)
 
@@ -608,7 +606,7 @@ 

Source code for eval.llm_as_judge

     )
     llm_judge = LLMasJudge()
     avg_judgement, judgement_list = llm_judge.compute(
-        questions, gt_answers, pred_answers, judgement_query
+        questions, pred_answers, gt_answers, judgement_query
     )
     print(avg_judgement)
     print(judgement_list)
diff --git a/_sources/developer_notes/evaluation.rst.txt b/_sources/developer_notes/evaluation.rst.txt
index b552576ec..9b9315eb8 100644
--- a/_sources/developer_notes/evaluation.rst.txt
+++ b/_sources/developer_notes/evaluation.rst.txt
@@ -73,39 +73,31 @@ If you are interested in computing metrics such as accuracy, F1-score, ROUGE, BE
 
 If you are particularly interested in evaluating RAG (Retrieval-Augmented Generation) pipelines, we have several metrics available in LightRAG to assess both the quality of the retrieved context and the quality of the final generated answer.
 
-- :class:`RetrieverRecall `: This is used to evaluate the recall of the retriever component of the RAG pipeline.
-- :class:`RetrieverRelevance `: This is used to evaluate the relevance of the retrieved context to the query.
-- :class:`AnswerMatchAcc `: This calculates the exact match accuracy or fuzzy match accuracy of the generated answers by comparing them to the ground truth answers.
-- :class:`LLMasJudge `: This uses an LLM to get the judgement of the generated answer for a list of questions. The task description and the judgement query of the LLM judge can be customized. It computes the judgement score, which is the number of generated answers that are judged as correct by the LLM divided by the total number of generated answers.
+- :class:`RetrieverEvaluator `: This evaluator is used to evaluate the performance of the retriever component of the RAG pipeline. It has metric functions to compute the recall and context relevance of the retriever.
+- :class:`AnswerMacthEvaluator `: This evaluator is used to evaluate the performance of the generator component of the RAG pipeline. It has metric functions to compute the exact match and fuzzy match accuracy of the generated answer.
+- :class:`LLMasJudge `: This evaluator uses an LLM to get the judgement of the predicted answer for a list of questions. The task description and the judgement query of the LLM judge can be customized. It has a metric function to compute the judgement score, which is the number of generated answers that are judged as correct by the LLM divided by the total number of generated answers.
 
 For example, you can use the following code snippet to compute the recall and relevance of the retriever component of the RAG pipeline for a single query.
 
 .. code-block:: python
     :linenos:
 
-    from lightrag.eval import RetrieverRecall, RetrieverRelevance
-    retrieved_contexts = [
-        "Apple is founded before Google.",
-        "Feburary has 28 days in common years. Feburary has 29 days in leap years. Feburary is the second month of the year.",
-    ]
-    gt_contexts = [
-        [
-            "Apple is founded in 1976.",
-            "Google is founded in 1998.",
-            "Apple is founded before Google.",
-        ],
-        ["Feburary has 28 days in common years", "Feburary has 29 days in leap years"],
-    ]
-    retriever_recall = RetrieverRecall()
-    avg_recall, recall_list = retriever_recall.compute(retrieved_contexts, gt_contexts) # Compute the recall of the retriever
-    print(f"Recall: {avg_recall}, Recall List: {recall_list}")
-    # Recall: 0.6666666666666666, Recall List: [0.3333333333333333, 1.0]
-    retriever_relevance = RetrieverRelevance()
-    avg_relevance, relevance_list = retriever_relevance.compute(retrieved_contexts, gt_contexts) # Compute the relevance of the retriever
-    print(f"Relevance: {avg_relevance}, Relevance List: {relevance_list}")
-    # Relevance: 0.803030303030303, Relevance List: [1.0, 0.6060606060606061]
-
-For a more detailed instructions on how build and evaluate RAG pipelines, you can refer to the use case on :doc:`Evaluating a RAG Pipeline <../tutorials/eval_a_rag>`.
+    from eval.evaluators import RetrieverEvaluator
+    retrieved_context = "Apple is founded before Google." # Retrieved context
+    gt_context = ["Apple is founded in 1976.",
+                  "Google is founded in 1998.",
+                  "Apple is founded before Google."] # Ground truth context
+    retriever_evaluator = RetrieverEvaluator() # Initialize the RetrieverEvaluator
+    recall = retriever_evaluator.compute_recall_single_query(
+        retrieved_context, gt_context
+    ) # Compute the recall of the retriever
+    relevance = retriever_evaluator.compute_context_relevance_single_query(
+        retrieved_context, gt_context
+    ) # Compute the relevance of the retriever
+    print(f"Recall: {recall}, Relevance: {relevance}")
+    # Recall: 0.3333333333333333, Relevance: 1.0
+
+For more detailed instructions on how to use these evaluators to evaluate RAG pipelines, you can refer to the tutorial on :doc:`Evaluating a RAG Pipeline <../tutorials/eval_a_rag>`, where we provide a step-by-step guide on how to use these evaluators to evaluate a RAG pipeline on the HotpotQA dataset.
 
 If you intend to use metrics that are not available in the LightRAG library, you can also implement your own custom metric functions or use other libraries such as `RAGAS `_ to compute the desired metrics for evaluating RAG pipelines.
 
diff --git a/_sources/developer_notes/index.rst.txt b/_sources/developer_notes/index.rst.txt
index b53249c89..bd92915a2 100644
--- a/_sources/developer_notes/index.rst.txt
+++ b/_sources/developer_notes/index.rst.txt
@@ -76,7 +76,6 @@ Code path: :ref:`lightrag.core `.
 
 RAG Essentials
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
 RAG components
 ^^^^^^^^^^^^^^^^^^^
 
@@ -99,6 +98,7 @@ Code path: :ref:`lightrag.core`. For abstract classes:
      - ``ModelClient`` is the protocol and base class for LightRAG to **integrate all models**, either APIs or local, LLMs or Embedding models or any others.
    * - :doc:`generator`
      - The orchestrator for LLM prediction. It streamlines three components: `ModelClient`, `Prompt`, and `output_processors` and works with optimizer for prompt optimization.
+       The **center component** that orchestrates the model client(LLMs in particular), prompt, and output processors for format parsing or any post processing.
    * - :doc:`output_parsers`
      - The component that parses the output string to structured data.
    * - :doc:`embedder`
@@ -106,7 +106,6 @@ Code path: :ref:`lightrag.core`. For abstract classes:
    * - :doc:`retriever`
      - The base class for all retrievers who in particular retrieve relevant documents from a given database to add **context** to the generator.
 
-
 Data Pipeline and Storage
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/apis/eval/eval.llm_as_judge.html b/apis/eval/eval.llm_as_judge.html
index fe09931f9..12d8427ad 100644
--- a/apis/eval/eval.llm_as_judge.html
+++ b/apis/eval/eval.llm_as_judge.html
@@ -521,7 +521,7 @@
 class DefaultLLMJudge(model_client: ModelClient | None = None, model_kwargs: Dict[str, Any] | None = None)[source]#
 

Bases: Component

Demonstrate how to use an LLM/Generator to output True or False for a judgement query.

-

You can use any of your template to adapt to more tasks and sometimes you can directly ask LLM to output a score in range [0, 1] instead of only True or False.

+

You can use any of your template to adapt to more tasks and sometimes you can directly ask LLM to output a score in range [0, 1] instead of only True or False.

A call on the LLM judge equalize to _compute_single_item method.

Parameters:
@@ -539,8 +539,8 @@
Parameters:
  • question (str) – Question string.

  • -
  • gt_answer (str) – Ground truth answer string.

  • pred_answer (str) – Predicted answer string.

  • +
  • gt_answer (str) – Ground truth answer string.

  • judgement_query (str) – Judgement query string.

@@ -576,7 +576,7 @@ >>> judgement_query = "For the question, does the predicted answer contain the ground truth answer?" >>> llm_judge = LLMasJudge() >>> avg_judgement, judgement_list = llm_judge.compute( -questions, gt_answers, pred_answers, judgement_query +questions, pred_answers, gt_answers, judgement_query ) >>> avg_judgement 2 / 3 @@ -586,26 +586,22 @@
-compute(questions: List[str], gt_answers: List[str], pred_answers: List[str], judgement_query: str) List[bool][source]#
+compute(questions: List[str], pred_answers: List[str], gt_answers: List[str], judgement_query: str) List[bool][source]#

Get the judgement of the predicted answer for a list of questions.

Parameters:
  • questions (List[str]) – List of question strings.

  • -
  • gt_answers (List[str]) – List of ground truth answer strings.

  • pred_answers (List[str]) – List of predicted answer strings.

  • +
  • gt_answers (List[str]) – List of ground truth answer strings.

  • judgement_query (str) – Judgement query string.

Returns:
-

    -
  • float: Average judgement score.

  • -
  • List[bool]: Judgement results for each query.

  • -
-

+

Judgement results.

Return type:
-

tuple

+

List[bool]

diff --git a/developer_notes/evaluation.html b/developer_notes/evaluation.html index dc6c500bc..6c8108b15 100644 --- a/developer_notes/evaluation.html +++ b/developer_notes/evaluation.html @@ -524,36 +524,28 @@

How to evaluate? -
  • RetrieverRecall: This is used to evaluate the recall of the retriever component of the RAG pipeline.

  • -
  • RetrieverRelevance: This is used to evaluate the relevance of the retrieved context to the query.

  • -
  • AnswerMatchAcc: This calculates the exact match accuracy or fuzzy match accuracy of the generated answers by comparing them to the ground truth answers.

  • -
  • LLMasJudge: This uses an LLM to get the judgement of the generated answer for a list of questions. The task description and the judgement query of the LLM judge can be customized. It computes the judgement score, which is the number of generated answers that are judged as correct by the LLM divided by the total number of generated answers.

  • +
  • RetrieverEvaluator: This evaluator is used to evaluate the performance of the retriever component of the RAG pipeline. It has metric functions to compute the recall and context relevance of the retriever.

  • +
  • AnswerMacthEvaluator: This evaluator is used to evaluate the performance of the generator component of the RAG pipeline. It has metric functions to compute the exact match and fuzzy match accuracy of the generated answer.

  • +
  • LLMasJudge: This evaluator uses an LLM to get the judgement of the predicted answer for a list of questions. The task description and the judgement query of the LLM judge can be customized. It has a metric function to compute the judgement score, which is the number of generated answers that are judged as correct by the LLM divided by the total number of generated answers.

  • For example, you can use the following code snippet to compute the recall and relevance of the retriever component of the RAG pipeline for a single query.

    -
     1from lightrag.eval import RetrieverRecall, RetrieverRelevance
    - 2retrieved_contexts = [
    - 3    "Apple is founded before Google.",
    - 4    "Feburary has 28 days in common years. Feburary has 29 days in leap years. Feburary is the second month of the year.",
    - 5]
    - 6gt_contexts = [
    - 7    [
    - 8        "Apple is founded in 1976.",
    - 9        "Google is founded in 1998.",
    -10        "Apple is founded before Google.",
    -11    ],
    -12    ["Feburary has 28 days in common years", "Feburary has 29 days in leap years"],
    -13]
    -14retriever_recall = RetrieverRecall()
    -15avg_recall, recall_list = retriever_recall.compute(retrieved_contexts, gt_contexts) # Compute the recall of the retriever
    -16print(f"Recall: {avg_recall}, Recall List: {recall_list}")
    -17# Recall: 0.6666666666666666, Recall List: [0.3333333333333333, 1.0]
    -18retriever_relevance = RetrieverRelevance()
    -19avg_relevance, relevance_list = retriever_relevance.compute(retrieved_contexts, gt_contexts) # Compute the relevance of the retriever
    -20print(f"Relevance: {avg_relevance}, Relevance List: {relevance_list}")
    -21# Relevance: 0.803030303030303, Relevance List: [1.0, 0.6060606060606061]
    +
     1from eval.evaluators import RetrieverEvaluator
    + 2retrieved_context = "Apple is founded before Google." # Retrieved context
    + 3gt_context = ["Apple is founded in 1976.",
    + 4              "Google is founded in 1998.",
    + 5              "Apple is founded before Google."] # Ground truth context
    + 6retriever_evaluator = RetrieverEvaluator() # Initialize the RetrieverEvaluator
    + 7recall = retriever_evaluator.compute_recall_single_query(
    + 8    retrieved_context, gt_context
    + 9) # Compute the recall of the retriever
    +10relevance = retriever_evaluator.compute_context_relevance_single_query(
    +11    retrieved_context, gt_context
    +12) # Compute the relevance of the retriever
    +13print(f"Recall: {recall}, Relevance: {relevance}")
    +14# Recall: 0.3333333333333333, Relevance: 1.0
     
    -

    For a more detailed instructions on how build and evaluate RAG pipelines, you can refer to the use case on Evaluating a RAG Pipeline.

    +

    For more detailed instructions on how to use these evaluators to evaluate RAG pipelines, you can refer to the tutorial on Evaluating a RAG Pipeline, where we provide a step-by-step guide on how to use these evaluators to evaluate a RAG pipeline on the HotpotQA dataset.

    If you intend to use metrics that are not available in the LightRAG library, you can also implement your own custom metric functions or use other libraries such as RAGAS to compute the desired metrics for evaluating RAG pipelines.