From c28ac8dbfa4621b6eb75c9181b1c00dd14602d9a Mon Sep 17 00:00:00 2001 From: jjmachan Date: Wed, 13 Nov 2024 11:32:29 +0530 Subject: [PATCH 01/13] feat: make aspect_critic more general --- src/ragas/metrics/_aspect_critic.py | 182 +++++----------------------- src/ragas/prompt/pydantic_prompt.py | 6 +- 2 files changed, 31 insertions(+), 157 deletions(-) diff --git a/src/ragas/metrics/_aspect_critic.py b/src/ragas/metrics/_aspect_critic.py index ea3a90e22..a8332a728 100644 --- a/src/ragas/metrics/_aspect_critic.py +++ b/src/ragas/metrics/_aspect_critic.py @@ -29,8 +29,21 @@ class AspectCriticOutput(BaseModel): class AspectCriticInput(BaseModel): - user_input: str = Field(description="The input to the model") - response: str = Field(description="The response from the model") + user_input: t.Optional[str] = Field( + description="The input to the llm system", default=None + ) + response: t.Optional[str] = Field( + description="The response from the llm system", default=None + ) + retrieved_contexts: t.Optional[t.List[str]] = Field( + description="The retrieved contexts from the llm system", default=None + ) + reference_contexts: t.Optional[t.List[str]] = Field( + description="The reference contexts for the evaluation", default=None + ) + reference: t.Optional[str] = Field( + description="The reference answer for evaluation", default=None + ) criteria: str = Field(description="The criteria to evaluate the response") @@ -56,7 +69,7 @@ class SingleTurnAspectCriticPrompt( reason="the criteria for evaluation is whether the output is written in perfect grammar. In this case, the output is grammatically correct.", verdict=1, ), - ) + ), ] @@ -102,9 +115,11 @@ class AspectCritic(MetricWithLLM, SingleTurnMetric, MultiTurnMetric): _required_columns: t.Dict[MetricType, t.Set[str]] = field( default_factory=lambda: { MetricType.SINGLE_TURN: { - "user_input", - "response", + "user_input:optional", + "response:optional", "retrieved_contexts:optional", + "reference:optional", + "reference_contexts:optional", }, MetricType.MULTI_TURN: { "user_input", @@ -159,20 +174,18 @@ async def _single_turn_ascore( async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: assert self.llm is not None, "set LLM before use" - user_input, context, response = ( - row["user_input"], - row.get("retrieved_contexts"), - row["response"], - ) - - if context is not None: - if isinstance(context, list): - context = "\n".join(context) - user_input = f"`user_input`: {user_input} Answer using `retrieved context`: {context}" + user_input = row.get("user_input") + response = row.get("response") + context = row.get("retrieved_contexts") + reference = row.get("reference") + reference_contexts = row.get("reference_contexts") prompt_input = AspectCriticInput( user_input=user_input, response=response, + retrieved_contexts=context, + reference=reference, + reference_contexts=reference_contexts, criteria=self.definition, ) @@ -202,145 +215,6 @@ async def _multi_turn_ascore( return self._compute_score([response]) -class AspectCriticInputWithReference(BaseModel): - user_input: str = Field(description="The input to the model") - response: str = Field(description="The response from the model") - reference: str = Field(description="The reference answer for comparison") - criteria: str = Field(description="The criteria to evaluate the response") - - -class MultiTurnAspectCriticInputWithReference(BaseModel): - user_input: str = Field(description="The input to the model") - reference: str = Field(description="The reference answer for 
comparison") - criteria: str = Field(description="The criteria to evaluate the response") - - -class AspectCriticOutputWithReference(BaseModel): - reason: str - verdict: int - - -class SingleTurnAspectCriticPromptWithReference( - PydanticPrompt[AspectCriticInputWithReference, AspectCriticOutputWithReference] -): - instruction = "Given an input, response, and reference. Evaluate the submission only using the given criteria. Use only 'Yes' (1) and 'No' (0) as verdict." - input_model = AspectCriticInputWithReference - output_model = AspectCriticOutputWithReference - examples = [ - ( - AspectCriticInputWithReference( - user_input="Who was the director of Los Alamos Laboratory?", - response="Einstein was the director of Los Alamos Laboratory.", - reference="J. Robert Oppenheimer was the director of Los Alamos Laboratory.", - criteria="Is the output written in perfect grammar", - ), - AspectCriticOutputWithReference( - reason="The criteria for evaluation is whether the output is written in perfect grammar. In this case, the output is grammatically correct.", - verdict=1, - ), - ) - ] - - -@dataclass -class AspectCriticWithReference(AspectCritic): - """ - AspectCriticWithReference judges the submission to give binary results using the criteria specified - It uses user_input, response and reference to evaluate the submission. - - Attributes - ---------- - name: str - name of the metrics - definition: str - criteria to judge the submission, example "Is the submission spreading - fake information?" - strictness: int - The number of times self consistency checks is made. Final judgement is - made using majority vote. - """ - - _required_columns: t.Dict[MetricType, t.Set[str]] = field( - default_factory=lambda: { - MetricType.SINGLE_TURN: { - "user_input", - "response", - "reference", - "retrieved_contexts:optional", - }, - MetricType.MULTI_TURN: { - "user_input", - "reference", - }, - } - ) - definition: str = field( - default="check if response is similar to reference", repr=True - ) - single_turn_prompt: PydanticPrompt = field( - default_factory=lambda: SingleTurnAspectCriticPromptWithReference() - ) - - multi_turn_prompt: PydanticPrompt = field( - default_factory=lambda: MultiTurnAspectCriticPrompt() - ) - - async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: - - if self.llm is None: - raise ValueError("LLM is not set") - - user_input, context, response, reference = ( - row["user_input"], - row.get("retrieved_contexts"), - row["response"], - row["reference"], - ) - - if context is not None: - if isinstance(context, list): - context = "\n".join(context) - user_input = f"`user_input`: {user_input} Answer using `retrieved context`: {context}" - - prompt_input = AspectCriticInputWithReference( - user_input=user_input, - response=response, - reference=reference, - criteria=self.definition, - ) - - response = await self.single_turn_prompt.generate( - data=prompt_input, - llm=self.llm, - callbacks=callbacks, - ) - - return self._compute_score([response]) - - async def _multi_turn_ascore( - self, sample: MultiTurnSample, callbacks: Callbacks - ) -> float: - - if self.llm is None: - raise ValueError("LLM is not set") - - if sample.reference is None: - raise ValueError("Reference is not set") - - interaction = sample.pretty_repr() - prompt_input = MultiTurnAspectCriticInputWithReference( - user_input=interaction, - reference=sample.reference, - criteria=self.definition, - ) - response = await self.multi_turn_prompt.generate( - data=prompt_input, - llm=self.llm, - callbacks=callbacks, - ) - 
return self._compute_score([response]) - - harmfulness = AspectCritic( name="harmfulness", definition="Does the submission cause or have the potential to cause harm to individuals, groups, or society at large?", # noqa: E501 diff --git a/src/ragas/prompt/pydantic_prompt.py b/src/ragas/prompt/pydantic_prompt.py index 950252ec8..4b07d13a5 100644 --- a/src/ragas/prompt/pydantic_prompt.py +++ b/src/ragas/prompt/pydantic_prompt.py @@ -55,10 +55,10 @@ def _generate_examples(self): self.instruction + "\n" + "input: " - + input_data.model_dump_json(indent=4) + + input_data.model_dump_json(indent=4, exclude_none=True) + "\n" + "output: " - + output_data.model_dump_json(indent=4) + + output_data.model_dump_json(indent=4, exclude_none=True) ) return ( @@ -78,7 +78,7 @@ def to_string(self, data: t.Optional[InputModel] = None) -> str: + self._generate_examples() + "\nNow perform the above instruction with the following input\n" + ( - "input: " + data.model_dump_json(indent=4) + "\n" + "input: " + data.model_dump_json(indent=4, exclude_none=True) + "\n" if data is not None else "input: (None)\n" ) From b6e35c206b4d3556c7c95302e7eb6ded4827bdfb Mon Sep 17 00:00:00 2001 From: jjmachan Date: Wed, 13 Nov 2024 12:49:23 +0530 Subject: [PATCH 02/13] feat: make rubric generalisable --- src/ragas/metrics/__init__.py | 16 +- src/ragas/metrics/_domain_specific_rubrics.py | 248 ++++-------------- .../metrics/_instance_specific_rubrics.py | 91 +------ 3 files changed, 69 insertions(+), 286 deletions(-) diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index ebf92ebbc..313db3ce5 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -9,7 +9,7 @@ SemanticSimilarity, answer_similarity, ) -from ragas.metrics._aspect_critic import AspectCritic, AspectCriticWithReference +from ragas.metrics._aspect_critic import AspectCritic from ragas.metrics._bleu_score import BleuScore from ragas.metrics._context_entities_recall import ( ContextEntityRecall, @@ -31,8 +31,7 @@ ) from ragas.metrics._datacompy_score import DataCompyScore from ragas.metrics._domain_specific_rubrics import ( - RubricsScoreWithoutReference, - RubricsScoreWithReference, + RubricsScore, ) from ragas.metrics._factual_correctness import FactualCorrectness from ragas.metrics._faithfulness import Faithfulness, FaithfulnesswithHHEM, faithfulness @@ -41,8 +40,7 @@ AgentGoalAccuracyWithReference, ) from ragas.metrics._instance_specific_rubrics import ( - InstanceRubricsScoreWithoutReference, - InstanceRubricsWithReference, + InstanceRubrics, ) from ragas.metrics._multi_modal_faithfulness import ( MultiModalFaithfulness, @@ -79,7 +77,6 @@ "ContextRecall", "context_recall", "AspectCritic", - "AspectCriticWithReference", "AnswerRelevancy", "answer_relevancy", "ContextEntityRecall", @@ -87,8 +84,7 @@ "SummarizationScore", "summarization_score", "NoiseSensitivity", - "RubricsScoreWithoutReference", - "RubricsScoreWithReference", + "RubricsScore", "LLMContextPrecisionWithReference", "LLMContextPrecisionWithoutReference", "NonLLMContextPrecisionWithReference", @@ -96,8 +92,7 @@ "LLMContextRecall", "NonLLMContextRecall", "FactualCorrectness", - "InstanceRubricsScoreWithoutReference", - "InstanceRubricsWithReference", + "InstanceRubrics", "NonLLMStringSimilarity", "ExactMatch", "StringPresence", @@ -117,5 +112,4 @@ "multimodal_faithness", "MultiModalRelevance", "multimodal_relevance", - "AspectCriticWithReference", ] diff --git a/src/ragas/metrics/_domain_specific_rubrics.py b/src/ragas/metrics/_domain_specific_rubrics.py 
index 2ec530e8e..7a80b1b34 100644 --- a/src/ragas/metrics/_domain_specific_rubrics.py +++ b/src/ragas/metrics/_domain_specific_rubrics.py @@ -43,28 +43,42 @@ class ScoreFeedback(BaseModel): score: int = Field(..., description="The score given to the response") -class SingleTurnWithoutReferenceInput(BaseModel): - user_input: str = Field(..., description="The user input") - response: str = Field(..., description="The response") +class SingleTurnInput(BaseModel): + user_input: t.Optional[str] = Field( + description="The input to the llm system", default=None + ) + response: t.Optional[str] = Field( + description="The response from the llm system", default=None + ) + retrieved_contexts: t.Optional[t.List[str]] = Field( + description="The retrieved contexts from the llm system", default=None + ) + reference_contexts: t.Optional[t.List[str]] = Field( + description="The reference contexts for the evaluation", default=None + ) + reference: t.Optional[str] = Field( + description="The reference answer for evaluation", default=None + ) rubrics: t.Dict[str, str] = Field(..., description="The rubric") -class MultiTurnWithoutReferenceInput(BaseModel): - user_input: str = Field(..., description="The user input") +class MultiTurnInput(BaseModel): + user_input: t.Optional[str] = Field(description="The user input", default=None) + reference: t.Optional[str] = Field( + description="The reference answer for evaluation", default=None + ) rubrics: t.Dict[str, str] = Field(..., description="The rubric") -class SingleTurnWithoutReferencePrompt( - PydanticPrompt[SingleTurnWithoutReferenceInput, ScoreFeedback] -): +class SingleTurnPrompt(PydanticPrompt[SingleTurnInput, ScoreFeedback]): instruction = """Given an user_input (which might contain an input along with it), a response to evaluate, and a score rubric representing evaluation criteria are given. 1. Write detailed feedback that assesses the quality of the response strictly based on the given score rubric, without evaluating in general. 2. After writing the feedback, assign a score between 1 and 5, referring to the score rubric.""" - input_model = SingleTurnWithoutReferenceInput + input_model = SingleTurnInput output_model = ScoreFeedback examples = [ ( - SingleTurnWithoutReferenceInput( + SingleTurnInput( user_input="What is the capital of France?", response="The capital of France is Paris.", rubrics=DEFAULT_REFERENCE_FREE_RUBRICS, @@ -77,17 +91,15 @@ class SingleTurnWithoutReferencePrompt( ] -class MultiTurnWithoutReferencePrompt( - PydanticPrompt[MultiTurnWithoutReferenceInput, ScoreFeedback] -): +class MultiTurnPrompt(PydanticPrompt[MultiTurnInput, ScoreFeedback]): instruction = """Given an interaction between AI,Human and external Tool as input and reference that's desired outcome that get's a score of 5,and a score rubric representing evaluation criteria are given. 1. Write detailed feedback that assesses the quality of the responselet strictly based on the given score rubric, without evaluating in general. 2. After writing the feedback, assign a score between 1 and 5, referring to the score rubric.""" - input_model = MultiTurnWithoutReferenceInput + input_model = MultiTurnInput output_model = ScoreFeedback examples = [ ( - MultiTurnWithoutReferenceInput( + MultiTurnInput( user_input="""Human: Hey, book a table at the nearest best Chinese restaurant for 8:00pm\nAI: Sure, let me find the best options for you.\nTools:\n restaurant_search: {'cuisine': 'Chinese', 'time': '8:00pm'}\nToolOutput: Found a few options: 1. Golden Dragon, 2. 
Jade Palace\nAI: I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?\nHuman: Let's go with Golden Dragon.\nAI: Great choice! I'll book a table for 8:00pm at Golden Dragon.\nTools:\n restaurant_book: {'name': 'Golden Dragon', 'time': '8:00pm'}\nToolOutput: Table booked at Golden Dragon for 8:00pm.\nAI: Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!\nHuman: thanks""", rubrics=DEFAULT_REFERENCE_FREE_RUBRICS, ), @@ -97,17 +109,19 @@ class MultiTurnWithoutReferencePrompt( @dataclass -class RubricsScoreWithoutReference(MetricWithLLM, SingleTurnMetric, MultiTurnMetric): - name: str = "rubrics_score_without_reference" +class RubricsScore(MetricWithLLM, SingleTurnMetric, MultiTurnMetric): + name: str = "rubrics_score" _required_columns: t.Dict[MetricType, t.Set[str]] = field( default_factory=lambda: { MetricType.SINGLE_TURN: { - "user_input", - "response", + "user_input:optional", + "response:optional", "retrieved_contexts:optional", + "reference:optional", + "reference_contexts:optional", }, MetricType.MULTI_TURN: { - "user_input", + "user_input:optional", }, }, repr=False, @@ -116,12 +130,12 @@ class RubricsScoreWithoutReference(MetricWithLLM, SingleTurnMetric, MultiTurnMet default_factory=lambda: DEFAULT_REFERENCE_FREE_RUBRICS ) max_retries: int = 1 - single_turn_scoring_prompt: PydanticPrompt[ - SingleTurnWithoutReferenceInput, ScoreFeedback - ] = field(default_factory=SingleTurnWithoutReferencePrompt, repr=False) - multi_turn_scoring_prompt: PydanticPrompt[ - MultiTurnWithoutReferenceInput, ScoreFeedback - ] = field(default_factory=MultiTurnWithoutReferencePrompt, repr=False) + single_turn_scoring_prompt: PydanticPrompt[SingleTurnInput, ScoreFeedback] = field( + default_factory=SingleTurnPrompt, repr=False + ) + multi_turn_scoring_prompt: PydanticPrompt[MultiTurnInput, ScoreFeedback] = field( + default_factory=MultiTurnPrompt, repr=False + ) async def _single_turn_ascore( self, sample: SingleTurnSample, callbacks: Callbacks @@ -131,148 +145,20 @@ async def _single_turn_ascore( async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: assert self.llm is not None, "LLM is not set" - prompt_input = self._create_single_turn_prompt(row) - output = await self.single_turn_scoring_prompt.generate( - data=prompt_input, - llm=self.llm, - callbacks=callbacks, - ) - return output.score - - async def _multi_turn_ascore( - self, sample: MultiTurnSample, callbacks: Callbacks - ) -> float: - assert self.llm is not None, "LLM is not set" - - interaction = sample.pretty_repr() - prompt_input = MultiTurnWithoutReferenceInput( - user_input=interaction, - rubrics=self.rubrics, - ) - output = await self.multi_turn_scoring_prompt.generate( - data=prompt_input, - llm=self.llm, - callbacks=callbacks, - ) - return output.score - - def _create_single_turn_prompt( - self, row: t.Dict - ) -> SingleTurnWithoutReferenceInput: - question, contexts, answer = ( - row["user_input"], - row.get("retrieved_contexts"), - row["response"], - ) - if contexts: - contexts = "\n".join(contexts) - question = f"{question} answer using context: {contexts}" + user_input = row.get("user_input") + reference = row.get("reference") + reference_contexts = row.get("reference_contexts") + response = row.get("response") + retrieved_contexts = row.get("retrieved_contexts") - return SingleTurnWithoutReferenceInput( - user_input=question, - response=answer, + prompt_input = SingleTurnInput( + user_input=user_input, + response=response, + retrieved_contexts=retrieved_contexts, + 
reference=reference, + reference_contexts=reference_contexts, rubrics=self.rubrics, ) - - -class SingleTurnWithReferenceInput(BaseModel): - user_input: str = Field(..., description="The user input") - response: str = Field(..., description="The response") - reference: str = Field(..., description="The reference") - rubrics: t.Dict[str, str] = Field(..., description="The rubric") - - -class MultiTurnWithReferenceInput(BaseModel): - user_input: str = Field(..., description="The user input") - reference: str = Field(..., description="The reference") - rubrics: t.Dict[str, str] = Field(..., description="The rubric") - - -class SingleTurnWithReferencePrompt( - PydanticPrompt[SingleTurnWithReferenceInput, ScoreFeedback] -): - instruction = """Given user input, response and reference that's desired outcome that get's a score of 5,and a score rubric representing evaluation criteria are given. - 1. Write detailed feedback that assesses the quality of the response strictly based on the given score rubric, without evaluating in general. - 2. After writing the feedback, assign a score between 1 and 5, referring to the score rubric.""" - input_model = SingleTurnWithReferenceInput - output_model = ScoreFeedback - examples = [ - ( - SingleTurnWithReferenceInput( - user_input="What is the capital of France?", - response="The capital of France is Paris.", - reference="The capital of France is Paris.", - rubrics=DEFAULT_WITH_REFERENCE_RUBRICS, - ), - ScoreFeedback( - feedback="The response is accurate and provides the correct answer to the question. The language is clear and concise, making it easy to understand. However, additional details could be included to enhance the response.", - score=5, - ), - ) - ] - - -class MultiTurnWithReferencePrompt( - PydanticPrompt[MultiTurnWithReferenceInput, ScoreFeedback] -): - instruction = """Given an interaction between AI,Human and external Tool as input and reference that's desired outcome that get's a score of 5,and a score rubric representing evaluation criteria are given. - 1. Write detailed feedback that assesses the quality of the responselet strictly based on the given score rubric, without evaluating in general. - 2. After writing the feedback, assign a score between 1 and 5, referring to the score rubric.""" - input_model = MultiTurnWithReferenceInput - output_model = ScoreFeedback - examples = [ - ( - MultiTurnWithReferenceInput( - user_input="""Human: Hey, book a table at the nearest best Chinese restaurant for 8:00pm\nAI: Sure, let me find the best options for you.\nTools:\n restaurant_search: {'cuisine': 'Chinese', 'time': '8:00pm'}\nToolOutput: Found a few options: 1. Golden Dragon, 2. Jade Palace\nAI: I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?\nHuman: Let's go with Golden Dragon.\nAI: Great choice! I'll book a table for 8:00pm at Golden Dragon.\nTools:\n restaurant_book: {'name': 'Golden Dragon', 'time': '8:00pm'}\nToolOutput: Table booked at Golden Dragon for 8:00pm.\nAI: Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!\nHuman: thanks""", - reference="The AI successfully books a table at the nearest best Chinese restaurant for 8:00pm, providing the user with options and confirming the booking.", - rubrics=DEFAULT_WITH_REFERENCE_RUBRICS, - ), - ScoreFeedback( - feedback="The AI successfully books a table at the nearest best Chinese restaurant for 8:00pm, providing the user with options and confirming the booking. 
The response is clear, accurate, and meets all the criteria for a score of 5 based on the rubric.", - score=5, - ), - ) - ] - - -@dataclass -class RubricsScoreWithReference(MetricWithLLM, SingleTurnMetric, MultiTurnMetric): - name: str = "rubrics_score_with_reference" - _required_columns: t.Dict[MetricType, t.Set[str]] = field( - default_factory=lambda: { - MetricType.SINGLE_TURN: { - "user_input", - "response", - "retrieved_contexts:optional", - "reference", - }, - MetricType.MULTI_TURN: { - "user_input", - "reference", - }, - }, - repr=False, - ) - rubrics: t.Dict[str, str] = field( - default_factory=lambda: DEFAULT_WITH_REFERENCE_RUBRICS - ) - max_retries: int = 1 - single_turn_scoring_prompt: PydanticPrompt[ - SingleTurnWithReferenceInput, ScoreFeedback - ] = field(default_factory=SingleTurnWithReferencePrompt, repr=False) - multi_turn_scoring_prompt: PydanticPrompt[ - MultiTurnWithReferenceInput, ScoreFeedback - ] = field(default_factory=MultiTurnWithReferencePrompt, repr=False) - - async def _single_turn_ascore( - self, sample: SingleTurnSample, callbacks: Callbacks - ) -> float: - return await self._ascore(sample.to_dict(), callbacks) - - async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: - assert self.llm is not None, "LLM is not set" - - prompt_input = self._create_single_turn_prompt(row) output = await self.single_turn_scoring_prompt.generate( data=prompt_input, llm=self.llm, @@ -286,37 +172,13 @@ async def _multi_turn_ascore( assert self.llm is not None, "LLM is not set" interaction = sample.pretty_repr() - row = {"interaction": interaction, "reference": sample.reference} - prompt_input = self._create_multi_turn_prompt(row) + prompt_input = MultiTurnInput( + user_input=interaction, + rubrics=self.rubrics, + ) output = await self.multi_turn_scoring_prompt.generate( data=prompt_input, llm=self.llm, callbacks=callbacks, ) return output.score - - def _create_multi_turn_prompt(self, row: t.Dict) -> MultiTurnWithReferenceInput: - interaction, reference = row["interaction"], row["reference"] - return MultiTurnWithReferenceInput( - user_input=interaction, - reference=reference, - rubrics=self.rubrics, - ) - - def _create_single_turn_prompt(self, row: t.Dict) -> SingleTurnWithReferenceInput: - question, contexts, answer, ground_truth = ( - row["user_input"], - row.get("retrieved_contexts"), - row["response"], - row["reference"], - ) - if contexts: - contexts = "\n".join(contexts) - question = f"{question} answer using context: {contexts}" - - return SingleTurnWithReferenceInput( - user_input=question, - response=answer, - reference=ground_truth, - rubrics=self.rubrics, - ) diff --git a/src/ragas/metrics/_instance_specific_rubrics.py b/src/ragas/metrics/_instance_specific_rubrics.py index 060d93dfb..f5249ded9 100644 --- a/src/ragas/metrics/_instance_specific_rubrics.py +++ b/src/ragas/metrics/_instance_specific_rubrics.py @@ -5,13 +5,10 @@ from ragas.dataset_schema import MultiTurnSample, SingleTurnSample from ragas.metrics._domain_specific_rubrics import ( - MultiTurnWithoutReferenceInput, - MultiTurnWithoutReferencePrompt, - MultiTurnWithReferenceInput, - SingleTurnWithoutReferenceInput, - SingleTurnWithoutReferencePrompt, - SingleTurnWithReferenceInput, - SingleTurnWithReferencePrompt, + MultiTurnInput, + MultiTurnPrompt, + SingleTurnInput, + SingleTurnPrompt, ) from ragas.metrics.base import ( MetricType, @@ -26,7 +23,7 @@ @dataclass -class InstanceRubricsWithReference(MetricWithLLM, SingleTurnMetric, MultiTurnMetric): +class InstanceRubrics(MetricWithLLM, 
SingleTurnMetric, MultiTurnMetric): name: str = "labelled_rubrics_score" _required_columns: t.Dict[MetricType, t.Set[str]] = field( default_factory=lambda: { @@ -35,11 +32,9 @@ class InstanceRubricsWithReference(MetricWithLLM, SingleTurnMetric, MultiTurnMet } ) single_turn_prompt: PydanticPrompt = field( - default_factory=lambda: SingleTurnWithReferencePrompt() - ) - multi_turn_prompt: PydanticPrompt = field( - default_factory=lambda: MultiTurnWithoutReferencePrompt() + default_factory=lambda: SingleTurnPrompt() ) + multi_turn_prompt: PydanticPrompt = field(default_factory=lambda: MultiTurnPrompt()) max_retries: int = 1 @@ -57,7 +52,7 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: contexts = "\n".join(contexts) user_input = f"{user_input} answer using context: {contexts}" - prompt_input = SingleTurnWithReferenceInput( + prompt_input = SingleTurnInput( user_input=user_input, response=response, reference=reference, @@ -85,7 +80,7 @@ async def _multi_turn_ascore( interaction = sample.pretty_repr() reference = sample.reference rubrics = sample.rubrics - prompt_input = MultiTurnWithReferenceInput( + prompt_input = MultiTurnInput( user_input=interaction, reference=reference, rubrics=rubrics, @@ -96,71 +91,3 @@ async def _multi_turn_ascore( callbacks=callbacks, ) return output.score - - -@dataclass -class InstanceRubricsScoreWithoutReference( - MetricWithLLM, SingleTurnMetric, MultiTurnMetric -): - name: str = "reference_free_rubrics_score" - _required_columns: t.Dict[MetricType, t.Set[str]] = field( - default_factory=lambda: { - MetricType.SINGLE_TURN: {"user_input", "response", "rubrics"}, - MetricType.MULTI_TURN: {"user_input", "rubrics"}, - } - ) - single_turn_prompt: PydanticPrompt = field( - default_factory=lambda: SingleTurnWithoutReferencePrompt() - ) - multi_turn_prompt: PydanticPrompt = field( - default_factory=lambda: MultiTurnWithoutReferencePrompt() - ) - max_retries: int = 1 - - async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: - assert self.llm is not None, "LLM is not set" - - user_input, contexts, response, rubrics = ( - row["user_input"], - row.get("retrieved_contexts"), - row["response"], - row["rubrics"], - ) - if contexts is not None: - contexts = "\n".join(contexts) - user_input = f"{user_input} answer using context: {contexts}" - - prompt_input = SingleTurnWithoutReferenceInput( - user_input=user_input, - response=response, - rubrics=rubrics, - ) - - response = await self.single_turn_prompt.generate( - data=prompt_input, llm=self.llm, callbacks=callbacks - ) - return response.score - - async def _single_turn_ascore( - self, sample: SingleTurnSample, callbacks: Callbacks - ) -> float: - row = sample.to_dict() - return await self._ascore(row, callbacks) - - async def _multi_turn_ascore( - self, sample: MultiTurnSample, callbacks: Callbacks - ) -> float: - assert self.llm is not None, "LLM is not set" - assert sample.rubrics is not None, "Rubrics are not set" - interaction = sample.pretty_repr() - rubrics = sample.rubrics - prompt_input = MultiTurnWithoutReferenceInput( - user_input=interaction, - rubrics=rubrics, - ) - output = await self.multi_turn_prompt.generate( - data=prompt_input, - llm=self.llm, - callbacks=callbacks, - ) - return output.score From 8d353cd1dd5b67024815075d062ff3a95e2d6263 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Wed, 13 Nov 2024 12:59:03 +0530 Subject: [PATCH 03/13] fix: import error and more examples --- src/ragas/metrics/_domain_specific_rubrics.py | 14 +++++++++++++- 
.../metrics/_instance_specific_rubrics.py | 18 +++++++++++++++--- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/ragas/metrics/_domain_specific_rubrics.py b/src/ragas/metrics/_domain_specific_rubrics.py index 7a80b1b34..129833910 100644 --- a/src/ragas/metrics/_domain_specific_rubrics.py +++ b/src/ragas/metrics/_domain_specific_rubrics.py @@ -104,7 +104,18 @@ class MultiTurnPrompt(PydanticPrompt[MultiTurnInput, ScoreFeedback]): rubrics=DEFAULT_REFERENCE_FREE_RUBRICS, ), ScoreFeedback(feedback="", score=5), - ) + ), + ( + MultiTurnInput( + user_input="""Human: Hey, book a table at the nearest best Chinese restaurant for 8:00pm\nAI: Sure, let me find the best options for you.\nTools:\n restaurant_search: {'cuisine': 'Chinese', 'time': '8:00pm'}\nToolOutput: Found a few options: 1. Golden Dragon, 2. Jade Palace\nAI: I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?\nHuman: Let's go with Golden Dragon.\nAI: Great choice! I'll book a table for 8:00pm at Golden Dragon.\nTools:\n restaurant_book: {'name': 'Golden Dragon', 'time': '8:00pm'}\nToolOutput: Table booked at Golden Dragon for 8:00pm.\nAI: Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!\nHuman: thanks""", + reference="The AI successfully books a table at the nearest best Chinese restaurant for 8:00pm, providing the user with options and confirming the booking.", + rubrics=DEFAULT_WITH_REFERENCE_RUBRICS, + ), + ScoreFeedback( + feedback="The AI successfully books a table at the nearest best Chinese restaurant for 8:00pm, providing the user with options and confirming the booking. The response is clear, accurate, and meets all the criteria for a score of 5 based on the rubric.", + score=5, + ), + ), ] @@ -122,6 +133,7 @@ class RubricsScore(MetricWithLLM, SingleTurnMetric, MultiTurnMetric): }, MetricType.MULTI_TURN: { "user_input:optional", + "reference:optional", }, }, repr=False, diff --git a/src/ragas/metrics/_instance_specific_rubrics.py b/src/ragas/metrics/_instance_specific_rubrics.py index f5249ded9..f26609e62 100644 --- a/src/ragas/metrics/_instance_specific_rubrics.py +++ b/src/ragas/metrics/_instance_specific_rubrics.py @@ -27,9 +27,21 @@ class InstanceRubrics(MetricWithLLM, SingleTurnMetric, MultiTurnMetric): name: str = "labelled_rubrics_score" _required_columns: t.Dict[MetricType, t.Set[str]] = field( default_factory=lambda: { - MetricType.SINGLE_TURN: {"user_input", "response", "reference", "rubrics"}, - MetricType.MULTI_TURN: {"user_input", "reference", "rubrics"}, - } + MetricType.SINGLE_TURN: { + "rubrics", + "user_input:optional", + "response:optional", + "retrieved_contexts:optional", + "reference:optional", + "reference_contexts:optional", + }, + MetricType.MULTI_TURN: { + "rubrics", + "user_input:optional", + "reference:optional", + }, + }, + repr=False, ) single_turn_prompt: PydanticPrompt = field( default_factory=lambda: SingleTurnPrompt() From f50eb5c4bf75b4c2b8ddac33b3a598422c958eec Mon Sep 17 00:00:00 2001 From: jjmachan Date: Wed, 13 Nov 2024 13:08:19 +0530 Subject: [PATCH 04/13] feat: made simple criteria generalisable --- src/ragas/metrics/__init__.py | 8 +- src/ragas/metrics/_simple_criteria.py | 162 ++++++-------------------- 2 files changed, 37 insertions(+), 133 deletions(-) diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index 313db3ce5..761023fb3 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -30,18 +30,14 @@ context_recall, ) from ragas.metrics._datacompy_score import 
DataCompyScore -from ragas.metrics._domain_specific_rubrics import ( - RubricsScore, -) +from ragas.metrics._domain_specific_rubrics import RubricsScore from ragas.metrics._factual_correctness import FactualCorrectness from ragas.metrics._faithfulness import Faithfulness, FaithfulnesswithHHEM, faithfulness from ragas.metrics._goal_accuracy import ( AgentGoalAccuracyWithoutReference, AgentGoalAccuracyWithReference, ) -from ragas.metrics._instance_specific_rubrics import ( - InstanceRubrics, -) +from ragas.metrics._instance_specific_rubrics import InstanceRubrics from ragas.metrics._multi_modal_faithfulness import ( MultiModalFaithfulness, multimodal_faithness, diff --git a/src/ragas/metrics/_simple_criteria.py b/src/ragas/metrics/_simple_criteria.py index 1dfc1c1cb..968e694e6 100644 --- a/src/ragas/metrics/_simple_criteria.py +++ b/src/ragas/metrics/_simple_criteria.py @@ -29,24 +29,34 @@ class SimpleCriteriaOutput(BaseModel): class SingleTurnSimpleCriteriaInput(BaseModel): - user_input: str = Field(description="The input to the model") - response: str = Field(description="The response from the model") + user_input: t.Optional[str] = Field( + description="The input to the llm system", default=None + ) + response: t.Optional[str] = Field( + description="The response from the llm system", default=None + ) + retrieved_contexts: t.Optional[t.List[str]] = Field( + description="The retrieved contexts from the llm system", default=None + ) + reference_contexts: t.Optional[t.List[str]] = Field( + description="The reference contexts for the evaluation", default=None + ) + reference: t.Optional[str] = Field( + description="The reference answer for evaluation", default=None + ) criteria: str = Field(description="The criteria to evaluate the response") -class SingleTurnSimpleCriteriaWithReferenceInput(SingleTurnSimpleCriteriaInput): - reference: str = Field(description="The reference response") - - class MultiTurnSimpleCriteriaInput(BaseModel): - user_input: str = Field(description="The input to the model") + user_input: t.Optional[str] = Field( + description="The input to the model", default=None + ) + reference: t.Optional[str] = Field( + description="The reference response", default=None + ) criteria: str = Field(description="The criteria to evaluate the response") -class MultiTurnSimpleCriteriaWithReferenceInput(MultiTurnSimpleCriteriaInput): - reference: str = Field(description="The reference response") - - class SingleTurnSimpleCriteriaPrompt( PydanticPrompt[SingleTurnSimpleCriteriaInput, SimpleCriteriaOutput] ): @@ -64,19 +74,9 @@ class SingleTurnSimpleCriteriaPrompt( reason="The response is grammatically correct and relevant to the input.", score=5, ), - ) - ] - - -class SingleTurnSimpleCriteriaWithReferencePrompt( - PydanticPrompt[SingleTurnSimpleCriteriaWithReferenceInput, SimpleCriteriaOutput] -): - instruction = "Given a input, system response and reference. Evaluate and score the response against the reference only using the given criteria." - input_model = SingleTurnSimpleCriteriaWithReferenceInput - output_model = SimpleCriteriaOutput - examples = [ + ), ( - SingleTurnSimpleCriteriaWithReferenceInput( + SingleTurnSimpleCriteriaInput( user_input="Who was the director of Los Alamos Laboratory?", response="Einstein was the director of Los Alamos Laboratory.", reference="The director of Los Alamos Laboratory was J. 
Robert Oppenheimer.", @@ -86,7 +86,7 @@ class SingleTurnSimpleCriteriaWithReferencePrompt( reason="The response and reference have two very different answers.", score=0, ), - ) + ), ] @@ -106,19 +106,9 @@ class MultiTurnSimpleCriteriaPrompt( reason="The interaction is coherent and relevant to the user's request.", score=5, ), - ) - ] - - -class MultiTurnSimpleCriteriaWithReferencePrompt( - PydanticPrompt[MultiTurnSimpleCriteriaWithReferenceInput, SimpleCriteriaOutput] -): - instruction = "Given an interaction between Human, AI and Tools evaluate and score the interaction using the given criteria." - input_model = MultiTurnSimpleCriteriaWithReferenceInput - output_model = SimpleCriteriaOutput - examples = [ + ), ( - MultiTurnSimpleCriteriaWithReferenceInput( + MultiTurnSimpleCriteriaInput( user_input="""Human: Hey, book a table at the nearest best Chinese restaurant for 8:00pm\nAI: Sure, let me find the best options for you.\nTools:\n restaurant_search: {'cuisine': 'Chinese', 'time': '8:00pm'}\nToolOutput: Found a few options: 1. Golden Dragon, 2. Jade Palace\nAI: I found some great options: Golden Dragon and Jade Palace. Which one would you prefer?\nHuman: Let's go with Golden Dragon.\nAI: Great choice! I'll book a table for 8:00pm at Golden Dragon.\nTools:\n restaurant_book: {'name': 'Golden Dragon', 'time': '8:00pm'}\nToolOutput: Table booked at Golden Dragon for 8:00pm.\nAI: Your table at Golden Dragon is booked for 8:00pm. Enjoy your meal!\nHuman: thanks""", reference="The AI successfully books a table at the nearest best Chinese restaurant for 8:00pm, providing the user with options and confirming the booking.", criteria="Score the interaction in range of 0 to 5 based on factors such as helpfulness, coherence, and relevance.", @@ -127,25 +117,12 @@ class MultiTurnSimpleCriteriaWithReferencePrompt( reason="The interaction is coherent and relevant to the user's request.", score=5, ), - ) + ), ] -class SimpleCriteriaOutout(BaseModel): - reason: str = Field(description="Reason for the score") - score: int = Field(description="The score for the submission") - - -class SimpleCriteriaWithoutReferenceInput(BaseModel): - user_input: str = Field(description="The input to the model") - response: str = Field(description="The response from the model") - criteria: str = Field(description="The criteria to evaluate the response") - - @dataclass -class SimpleCriteriaScoreWithoutReference( - MetricWithLLM, SingleTurnMetric, MultiTurnMetric -): +class SimpleCriteriaScore(MetricWithLLM, SingleTurnMetric, MultiTurnMetric): """ Judges the submission to give binary results using the criteria specified in the metric definition. 
@@ -165,11 +142,15 @@ class SimpleCriteriaScoreWithoutReference( _required_columns: t.Dict[MetricType, t.Set[str]] = field( default_factory=lambda: { MetricType.SINGLE_TURN: { - "user_input", - "response", + "user_input:optional", + "response:optional", + "retrieved_contexts:optional", + "reference:optional", + "reference_contexts:optional", }, MetricType.MULTI_TURN: { - "user_input", + "user_input:optional", + "reference:optional", }, } ) @@ -257,76 +238,3 @@ async def _multi_turn_ascore( callbacks=callbacks, ) return self._compute_score([response]) - - -@dataclass -class SimpleCriteriaScoreWithReference(SimpleCriteriaScoreWithoutReference): - name: str = field(default="", repr=True) - _required_columns: t.Dict[MetricType, t.Set[str]] = field( - default_factory=lambda: { - MetricType.SINGLE_TURN: { - "user_input", - "response", - "reference", - }, - MetricType.MULTI_TURN: { - "user_input", - "reference", - }, - } - ) - single_turn_prompt: PydanticPrompt = field( - default_factory=lambda: SingleTurnSimpleCriteriaWithReferencePrompt() - ) - multi_turn_prompt: PydanticPrompt = field( - default_factory=lambda: MultiTurnSimpleCriteriaWithReferencePrompt() - ) - - async def _single_turn_ascore( - self, sample: SingleTurnSample, callbacks: Callbacks - ) -> float: - assert self.llm is not None, "LLM is not set" - assert sample.user_input is not None, "User input is not set" - assert sample.reference is not None, "Reference is not set" - assert sample.response is not None, "Response is not set" - - prompt_input = SingleTurnSimpleCriteriaWithReferenceInput( - user_input=sample.user_input, - response=sample.response, - reference=sample.reference, - criteria=self.definition, - ) - - response = await self.single_turn_prompt.generate( - data=prompt_input, - llm=self.llm, - callbacks=callbacks, - ) - - return self._compute_score([response]) - - async def _multi_turn_ascore( - self, sample: MultiTurnSample, callbacks: Callbacks - ) -> float: - assert self.llm is not None, "LLM is not set" - assert sample.user_input is not None, "User input is not set" - assert sample.reference is not None, "Reference is not set" - - interaction = sample.pretty_repr() - prompt_input = MultiTurnSimpleCriteriaWithReferenceInput( - user_input=interaction, - reference=sample.reference, - criteria=self.definition, - ) - - response = await self.multi_turn_prompt.generate( - data=prompt_input, - llm=self.llm, - callbacks=callbacks, - ) - - return self._compute_score([response]) - - async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: - sample = SingleTurnSample(**row) - return await self._single_turn_ascore(sample, callbacks) From 5eed4d97b72e22a3c2a929679920f6ebae8e911d Mon Sep 17 00:00:00 2001 From: jjmachan Date: Wed, 13 Nov 2024 13:08:35 +0530 Subject: [PATCH 05/13] fmt: format changes --- docs/howtos/customizations/metrics/cost.ipynb | 12 +- .../testgenerator/persona_generator.ipynb | 17 +- docs/howtos/integrations/langchain.ipynb | 29 +- .../langgraph_agent_evaluation.ipynb | 1546 ++++++++--------- src/ragas/metrics/_noise_sensitivity.py | 5 +- src/ragas/testset/synthesizers/generate.py | 5 +- src/ragas/testset/transforms/__init__.py | 10 +- src/ragas/testset/transforms/default.py | 4 +- .../transforms/extractors/llm_based.py | 4 +- src/ragas/utils.py | 1 + 10 files changed, 836 insertions(+), 797 deletions(-) diff --git a/docs/howtos/customizations/metrics/cost.ipynb b/docs/howtos/customizations/metrics/cost.ipynb index f317a6123..d8d98ad51 100644 --- a/docs/howtos/customizations/metrics/cost.ipynb +++ 
b/docs/howtos/customizations/metrics/cost.ipynb @@ -29,6 +29,7 @@ "outputs": [], "source": [ "import os\n", + "\n", "os.environ[\"OPENAI_API_KEY\"] = \"your-api-key\"" ] }, @@ -105,8 +106,7 @@ "metric = AspectCriticWithReference(\n", " name=\"answer_correctness\",\n", " definition=\"is the response correct compared to reference\",\n", - ")\n", - "\n" + ")" ] }, { @@ -126,8 +126,12 @@ "from ragas import evaluate\n", "from ragas.cost import get_token_usage_for_openai\n", "\n", - "results = evaluate(eval_dataset[:5], metrics=[metric], llm=gpt4o,\n", - " token_usage_parser=get_token_usage_for_openai,)" + "results = evaluate(\n", + " eval_dataset[:5],\n", + " metrics=[metric],\n", + " llm=gpt4o,\n", + " token_usage_parser=get_token_usage_for_openai,\n", + ")" ] }, { diff --git a/docs/howtos/customizations/testgenerator/persona_generator.ipynb b/docs/howtos/customizations/testgenerator/persona_generator.ipynb index 7ed8e7744..c29d8a0fc 100644 --- a/docs/howtos/customizations/testgenerator/persona_generator.ipynb +++ b/docs/howtos/customizations/testgenerator/persona_generator.ipynb @@ -38,9 +38,18 @@ "source": [ "from ragas.testset.persona import Persona\n", "\n", - "persona_new_joinee = Persona(name=\"New Joinee\", role_description=\"Don't know much about the company and is looking for information on how to get started.\")\n", - "persona_manager = Persona(name=\"Manager\", role_description=\"Wants to know about the different teams and how they collaborate with each other.\")\n", - "persona_senior_manager = Persona(name=\"Senior Manager\", role_description=\"Wants to know about the company vision and how it is executed.\")\n", + "persona_new_joinee = Persona(\n", + " name=\"New Joinee\",\n", + " role_description=\"Don't know much about the company and is looking for information on how to get started.\",\n", + ")\n", + "persona_manager = Persona(\n", + " name=\"Manager\",\n", + " role_description=\"Wants to know about the different teams and how they collaborate with each other.\",\n", + ")\n", + "persona_senior_manager = Persona(\n", + " name=\"Senior Manager\",\n", + " role_description=\"Wants to know about the company vision and how it is executed.\",\n", + ")\n", "\n", "personas = [persona_new_joinee, persona_manager, persona_senior_manager]\n", "personas" @@ -72,7 +81,7 @@ "testset_generator = TestsetGenerator(knowledge_graph=kg, persona_list=personas, llm=llm)\n", "# Generate the Testset\n", "testset = testset_generator.generate(testset_size=10)\n", - "testset\n" + "testset" ] }, { diff --git a/docs/howtos/integrations/langchain.ipynb b/docs/howtos/integrations/langchain.ipynb index 0136d9db0..5e83b0890 100644 --- a/docs/howtos/integrations/langchain.ipynb +++ b/docs/howtos/integrations/langchain.ipynb @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "fb5deb25", "metadata": {}, "outputs": [], @@ -59,10 +59,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "4aa9a986", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jjmachan/.pyenv/versions/ragas/lib/python3.10/site-packages/langchain/indexes/vectorstore.py:128: UserWarning: Using InMemoryVectorStore as the default vectorstore.This memory store won't persist data. 
You should explicitlyspecify a vectorstore when using VectorstoreIndexCreator\n", + " warnings.warn(\n" + ] + }, + { + "ename": "ValidationError", + "evalue": "1 validation error for VectorstoreIndexCreator\nembedding\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.9/v/missing", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 7\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain_openai\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ChatOpenAI\n\u001b[1;32m 6\u001b[0m loader \u001b[38;5;241m=\u001b[39m TextLoader(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m./nyc_wikipedia/nyc_text.txt\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 7\u001b[0m index \u001b[38;5;241m=\u001b[39m \u001b[43mVectorstoreIndexCreator\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mfrom_loaders([loader])\n\u001b[1;32m 10\u001b[0m llm \u001b[38;5;241m=\u001b[39m ChatOpenAI(temperature\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[1;32m 11\u001b[0m qa_chain \u001b[38;5;241m=\u001b[39m RetrievalQA\u001b[38;5;241m.\u001b[39mfrom_chain_type(\n\u001b[1;32m 12\u001b[0m llm,\n\u001b[1;32m 13\u001b[0m retriever\u001b[38;5;241m=\u001b[39mindex\u001b[38;5;241m.\u001b[39mvectorstore\u001b[38;5;241m.\u001b[39mas_retriever(),\n\u001b[1;32m 14\u001b[0m return_source_documents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 15\u001b[0m )\n", + "File \u001b[0;32m~/.pyenv/versions/ragas/lib/python3.10/site-packages/pydantic/main.py:212\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(self, **data)\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 211\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 212\u001b[0m validated_self \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m validated_self:\n\u001b[1;32m 214\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 215\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mA custom validator is returning a value other than `self`.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mReturning anything other than `self` from a top level model validator isn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt supported when validating via `__init__`.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 217\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSee the `model_validator` docs 
(https://docs.pydantic.dev/latest/concepts/validators/#model-validators) for more details.\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 218\u001b[0m category\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 219\u001b[0m )\n", + "\u001b[0;31mValidationError\u001b[0m: 1 validation error for VectorstoreIndexCreator\nembedding\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.9/v/missing" + ] + } + ], "source": [ "from langchain_community.document_loaders import TextLoader\n", "from langchain.indexes import VectorstoreIndexCreator\n", @@ -495,7 +516,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/docs/howtos/integrations/langgraph_agent_evaluation.ipynb b/docs/howtos/integrations/langgraph_agent_evaluation.ipynb index 3f2b59698..a719c8511 100644 --- a/docs/howtos/integrations/langgraph_agent_evaluation.ipynb +++ b/docs/howtos/integrations/langgraph_agent_evaluation.ipynb @@ -1,783 +1,783 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "t1ub1OLYZQvz" - }, - "source": [ - "# Building and Evaluating a ReAct Agent for Fetching Metal Prices\n", - "\n", - "AI agents are becoming increasingly valuable in domains like finance, e-commerce, and customer support. These agents can autonomously interact with APIs, retrieve real-time data, and perform tasks that align with user goals. Evaluating these agents is crucial to ensure they are effective, accurate, and responsive to different inputs.\n", - "\n", - "In this tutorial, we'll:\n", - "\n", - "1. Build a [ReAct agent](https://arxiv.org/abs/2210.03629) to fetch metal prices.\n", - "2. Set up an evaluation pipeline to track key performance metrics.\n", - "3. Run and assess the agent's effectiveness with different queries.\n", - "\n", - "Click the [link](https://colab.research.google.com/github/explodinggradients/ragas/blob/main/docs/howtos/integrations/langgraph_agent_evaluation.ipynb) to open the notebook in Google Colab." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisites\n", - "- Python 3.8+\n", - "- Basic understanding of LangGraph, LangChain and LLMs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Q8Ms4ngAZQv1" - }, - "source": [ - "## Installing Ragas and Other Dependencies\n", - "Install Ragas and Langgraph with pip:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "collapsed": true, - "id": "vQk4aWbpZQv1", - "outputId": "4af0ac60-3d1a-4e41-de6e-d33f74921845" - }, - "outputs": [], - "source": [ - "%pip install langgraph==0.2.44\n", - "%pip install ragas\n", - "%pip install nltk" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eJJ-WKWMZQv2" - }, - "source": [ - "## Building the ReAct Agent" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lAXAIbo7ZQv2" - }, - "source": [ - "### Initializing External Components\n", - "To begin, you have two options for setting up the external components:\n", - "\n", - "1. Use a Live API Key: \n", - "\n", - " - Sign up for an account on [metals.dev](https://metals.dev/) to get your API key. \n", - " \n", - "2. Simulate the API Response: \n", - "\n", - " - Alternatively, you can use a predefined JSON object to simulate the API response. 
This allows you to get started more quickly without needing a live API key. \n", - "\n", - "\n", - "Choose the method that best fits your needs to proceed with the setup." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PNZijyBXZQv3" - }, - "source": [ - "### Predefined JSON Object to simulate API response\n", - "If you would like to quickly get started without creating an account, you can bypass the setup process and use the predefined JSON object given below that simulates the API response." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "puMC36BPZQv3" - }, - "outputs": [], - "source": [ - "metal_price = {\n", - " \"gold\": 88.1553,\n", - " \"silver\": 1.0523,\n", - " \"platinum\": 32.169,\n", - " \"palladium\": 35.8252,\n", - " \"lbma_gold_am\": 88.3294,\n", - " \"lbma_gold_pm\": 88.2313,\n", - " \"lbma_silver\": 1.0545,\n", - " \"lbma_platinum_am\": 31.99,\n", - " \"lbma_platinum_pm\": 32.2793,\n", - " \"lbma_palladium_am\": 36.0088,\n", - " \"lbma_palladium_pm\": 36.2017,\n", - " \"mcx_gold\": 93.2689,\n", - " \"mcx_gold_am\": 94.281,\n", - " \"mcx_gold_pm\": 94.1764,\n", - " \"mcx_silver\": 1.125,\n", - " \"mcx_silver_am\": 1.1501,\n", - " \"mcx_silver_pm\": 1.1483,\n", - " \"ibja_gold\": 93.2713,\n", - " \"copper\": 0.0098,\n", - " \"aluminum\": 0.0026,\n", - " \"lead\": 0.0021,\n", - " \"nickel\": 0.0159,\n", - " \"zinc\": 0.0031,\n", - " \"lme_copper\": 0.0096,\n", - " \"lme_aluminum\": 0.0026,\n", - " \"lme_lead\": 0.002,\n", - " \"lme_nickel\": 0.0158,\n", - " \"lme_zinc\": 0.0031,\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2SduQYJbZQv3" - }, - "source": [ - "### Define the get_metal_price Tool\n", - "\n", - "The get_metal_price tool will be used by the agent to fetch the price of a specified metal. We'll create this tool using the @tool decorator from LangChain.\n", - "\n", - "If you want to use real-time data from the metals.dev API, you can modify the function to make a live request to the API." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "1X2TsFLfZQv3" - }, - "outputs": [], - "source": [ - "from langchain_core.tools import tool\n", - "\n", - "\n", - "# Define the tools for the agent to use\n", - "@tool\n", - "def get_metal_price(metal_name: str) -> float:\n", - " \"\"\"Fetches the current per gram price of the specified metal.\n", - "\n", - " Args:\n", - " metal_name : The name of the metal (e.g., 'gold', 'silver', 'platinum').\n", - "\n", - " Returns:\n", - " float: The current price of the metal in dollars per gram.\n", - "\n", - " Raises:\n", - " KeyError: If the specified metal is not found in the data source.\n", - " \"\"\"\n", - " try:\n", - " metal_name = metal_name.lower().strip()\n", - " if metal_name not in metal_price:\n", - " raise KeyError(\n", - " f\"Metal '{metal_name}' not found. Available metals: {', '.join(metal_price['metals'].keys())}\"\n", - " )\n", - " return metal_price[metal_name]\n", - " except Exception as e:\n", - " raise Exception(f\"Error fetching metal price: {str(e)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "j85XikcLZQv4" - }, - "source": [ - "### Binding the Tool to the LLM\n", - "With the get_metal_price tool defined, the next step is to bind it to the ChatOpenAI model. This enables the agent to invoke the tool during its execution based on the user's requests allowing it to interact with external data and perform actions beyond its native capabilities." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "lsxVT0lUZQv4" - }, - "outputs": [], - "source": [ - "from langchain_openai import ChatOpenAI\n", - "\n", - "tools = [get_metal_price]\n", - "llm = ChatOpenAI(model=\"gpt-4o-mini\")\n", - "llm_with_tools = llm.bind_tools(tools)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yuDuSrmQZQv4" - }, - "source": [ - "In LangGraph, state plays a crucial role in tracking and updating information as the graph executes. As different parts of the graph run, the state evolves to reflect the changes and contains information that is passed between nodes.\n", - "\n", - "For example, in a conversational system like this one, the state is used to track the exchanged messages. Each time a new message is generated, it is added to the state and the updated state is passed through the nodes, ensuring the conversation progresses logically.\n", - "\n", - "### Defining the State\n", - "To implement this in LangGraph, we define a state class that maintains a list of messages. Whenever a new message is produced it gets appended to this list, ensuring that the conversation history is continuously updated." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "JHHXxYT1ZQv4" - }, - "outputs": [], - "source": [ - "from langgraph.graph import END\n", - "from langchain_core.messages import AnyMessage\n", - "from langgraph.graph.message import add_messages\n", - "from typing import Annotated\n", - "from typing_extensions import TypedDict\n", - "\n", - "\n", - "class GraphState(TypedDict):\n", - " messages: Annotated[list[AnyMessage], add_messages]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1KGbjrAOZQv4" - }, - "source": [ - "### Defining the should_continue Function\n", - "The `should_continue` function determines whether the conversation should proceed with further tool interactions or end. Specifically, it checks if the last message contains any tool calls (e.g., a request for metal prices).\n", - "\n", - "- If the last message includes tool calls, indicating that the agent has invoked an external tool, the conversation continues and moves to the \"tools\" node.\n", - "- If there are no tool calls, the conversation ends, represented by the END state." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "KjppKPRDZQv4" - }, - "outputs": [], - "source": [ - "# Define the function that determines whether to continue or not\n", - "def should_continue(state: GraphState):\n", - " messages = state[\"messages\"]\n", - " last_message = messages[-1]\n", - " if last_message.tool_calls:\n", - " return \"tools\"\n", - " return END" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZbyJRNRvZQv4" - }, - "source": [ - "### Calling the Model\n", - "The `call_model` function interacts with the Language Model (LLM) to generate a response based on the current state of the conversation. It takes the updated state as input, processes it and returns a model-generated response." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "ZYflc7eZZQv4" - }, - "outputs": [], - "source": [ - "# Define the function that calls the model\n", - "def call_model(state: GraphState):\n", - " messages = state[\"messages\"]\n", - " response = llm_with_tools.invoke(messages)\n", - " return {\"messages\": [response]}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VzxIHVa2ZQv4" - }, - "source": [ - "### Creating the Assistant Node\n", - "The `assistant` node is a key component responsible for processing the current state of the conversation and using the Language Model (LLM) to generate a relevant response. It evaluates the state, determines the appropriate course of action, and invokes the LLM to produce a response that aligns with the ongoing dialogue." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "_fPD6W2SZQv4" - }, - "outputs": [], - "source": [ - "# Node\n", - "def assistant(state: GraphState):\n", - " response = llm_with_tools.invoke(state[\"messages\"])\n", - " return {\"messages\": [response]}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Vc3No3agZQv5" - }, - "source": [ - "### Creating the Tool Node\n", - "The `tool_node` is responsible for managing interactions with external tools, such as fetching metal prices or performing other actions beyond the LLM's native capabilities. The tools themselves are defined earlier in the code, and the tool_node invokes these tools based on the current state and the needs of the conversation." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "vz2qlceBZQv5" - }, - "outputs": [], - "source": [ - "from langgraph.prebuilt import ToolNode\n", - "\n", - "# Node\n", - "tools = [get_metal_price]\n", - "tool_node = ToolNode(tools)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "M2FWZfGFZQv5" - }, - "source": [ - "### Building the Graph\n", - "The graph structure is the backbone of the agentic workflow, consisting of interconnected nodes and edges. To construct this graph, we use the StateGraph builder which allows us to define and connect various nodes. Each node represents a step in the process (e.g., the assistant node, tool node) and the edges dictate the flow of execution between these steps." 
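Before wiring the nodes into a graph, you can exercise `tool_node` on its own by handing it a state whose last message contains a tool call. This is a hedged sketch of the prebuilt `ToolNode` behaviour; it assumes `get_metal_price` and `tool_node` from the cells above, and the exact return shape can differ between langgraph versions.

```python
from langchain_core.messages import AIMessage

# Simulate the assistant having requested a tool call, then run the tool node on that
# state. ToolNode executes the matching tool and returns its output as a ToolMessage.
state_with_tool_call = {
    "messages": [
        AIMessage(
            content="",
            tool_calls=[
                {
                    "name": "get_metal_price",
                    "args": {"metal_name": "copper"},
                    "id": "demo_call_1",
                }
            ],
        )
    ]
}

result = tool_node.invoke(state_with_tool_call)
print(result["messages"][-1].content)  # "0.0098"
```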
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 266 - }, - "id": "FeGI8G3KZQv5", - "outputId": "4575b3ed-e162-4419-f44f-ff0086aaf546" - }, - "outputs": [ - { - "data": { - "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/4gHYSUNDX1BST0ZJTEUAAQEAAAHIAAAAAAQwAABtbnRyUkdCIFhZWiAH4AABAAEAAAAAAABhY3NwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAA9tYAAQAAAADTLQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAlkZXNjAAAA8AAAACRyWFlaAAABFAAAABRnWFlaAAABKAAAABRiWFlaAAABPAAAABR3dHB0AAABUAAAABRyVFJDAAABZAAAAChnVFJDAAABZAAAAChiVFJDAAABZAAAAChjcHJ0AAABjAAAADxtbHVjAAAAAAAAAAEAAAAMZW5VUwAAAAgAAAAcAHMAUgBHAEJYWVogAAAAAAAAb6IAADj1AAADkFhZWiAAAAAAAABimQAAt4UAABjaWFlaIAAAAAAAACSgAAAPhAAAts9YWVogAAAAAAAA9tYAAQAAAADTLXBhcmEAAAAAAAQAAAACZmYAAPKnAAANWQAAE9AAAApbAAAAAAAAAABtbHVjAAAAAAAAAAEAAAAMZW5VUwAAACAAAAAcAEcAbwBvAGcAbABlACAASQBuAGMALgAgADIAMAAxADb/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/2wBDAQMEBAUEBQkFBQkUDQsNFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBT/wAARCAD5ANYDASIAAhEBAxEB/8QAHQABAAIDAQEBAQAAAAAAAAAAAAUGAwQHCAECCf/EAFEQAAEEAQIDAgYLDAcGBwAAAAEAAgMEBQYRBxIhEzEVFiJBUZQIFBcyVVZhdNHS0yM1NlRxdYGRk5WytCU3QkNSgpIYJGRylqEzNFNiscHw/8QAGwEBAQADAQEBAAAAAAAAAAAAAAECAwUEBgf/xAAzEQEAAQIBCQUJAQADAAAAAAAAAQIRAwQSITFBUVKR0RQzYXGhBRMVI2KSscHhgSLw8f/aAAwDAQACEQMRAD8A/qmiIgIiICIiAsNq5XpR89ieOuz/ABSvDR+sqDu37uevz47FTGlVrnkt5NrQ5zX/APpQhwLS4d7nuBa3cNAc4u5Ptbh/p+F5llxcF+ydua1fb7ZmcR5y9+5/V0W+KKae8n/IW29u+NWF+F6HrLPpTxqwvwxQ9ZZ9KeKuF+B6HqzPoTxVwvwPQ9WZ9CvyfH0XQeNWF+GKHrLPpTxqwvwxQ9ZZ9KeKuF+B6HqzPoTxVwvwPQ9WZ9CfJ8fQ0HjVhfhih6yz6U8asL8MUPWWfSnirhfgeh6sz6E8VcL8D0PVmfQnyfH0NB41YX4Yoess+lblTIVb7S6rZhstHeYZA4D9S0/FXC/A9D1Zn0LUtaB05bkErsNThnad22K0QhmafkkZs4foKfJnbPp/E0J9FWI7NzSM8MN+1NksPK4RsvT8va1XE7NbKQAHMPQB+24O3NvuXCzrXXRm+MEwIiLWgiIgIiICIiAiIgIiICIiAojV2Yfp/S+VyMQDpq1Z8kTXdxft5IP6dlLqvcQqct7ROZjhaZJm13SsY0blzmeWAB6SW7LbgxE4lMVarwsa0hp/Dx4DDVKEZ5uxZ5cnnkkJ3e8/K5xc4n0kqRWGnaivVILMDueGZjZGO9LSNwf1FZlhVMzVM1a0FUuIHFbS3C6LHv1JkzSfkJHRVIIa01madzW8z+SKFj3kNHUnbYbjchW1cU9krQqPg07k48frBupMc+zJiM5o7HG7NQldG0OZNEA4Ojl6Atc0tPL1LehWI2cp7JjT+N4q6b0m2tetUc3hfC8OTq463ODzyQthaGxwu8lzZHOdISAzZodylwVgtcftBUdct0hZz3tfOvtNotilpzthNhw3bCJzH2XaHcbN59zuBsuUx5fWendd8Ltfax0nlrtuxpGzicxDp6g+4+neklrTDnij3LWu7J43G4aehPnVA4t4/Wep5tTDMYbX+W1Bj9VwW8fUxsEwwsOJguRSRyRtjIjsSGJpJGz5ec9GgDoHpi3x20TT1je0ocpYsahozR17VCnjbVh8DpI2yMLzHE4NYWvb5ZPLuSN9wQIvgLx7xvHPBWblWjdx1yvYsxyV56VlkYjZYkijc2aSJjHuc1gc5jSSwktcAQtbhLp+7jOMXGnJWsbYqQZLLY91W3NA5jbUbMdA0ljiNnta/nb03APMO/dRfsY7GQ0vh8poTMaezWNyWLymUte3rFF7aFmGW9JLG6GxtyPLmzNPKDuOV24GyDuCIiDXyFCvlaFmlbibPVsxuhlif3PY4bOB/KCVEaGvz39Nwi1L29upLNRmlO+8j4ZXRF53/wAXJzfpU+qzw8b2mn5Lg35L921cj5htvHJO90Z2+VnKf0r0U9zVffH7XYsyIi86CIiAiIgIiICIiAiIgIiICIiCqU52aDeaNvaLAOeXU7fXkqbncwynuY3cnkf0btsw7EN7THqvhFobX+RjyWo9JYTP3mxCFlrIUYp5BGCSGhzgTy7ucdvlKtr2NkY5j2h7HDYtcNwR6Cq0/h9joSTjbOQwoP8AdY62+OIejaI7xt/Q0f8AYL0TVRiaa5tPO/8A3/WWiVePsbeFBaG+5vpblBJA8EwbA+f+z8gVm0fw70tw9hsxaY09jNPxWXNdOzG1GQCUjcAuDQN9tz3+lYfEmx8as9+2h+yTxJsfGrPftofsk93h8fpKWjetCKr+JNj41Z79tD9kqnex2Wr8VcHp5mqcx4OuYW/flJlh7TtYZ6bGbfc/e8tiTfp38vUed7vD4/SS0b3VFC6s0XgNd4xuO1HhaGdx7ZBM2rka7Z4w8AgO5XAjcBxG/wApWj4k2PjVnv20P2SeJNj41Z79tD9knu8Pj9JLRvQDfY3cKWBwbw40u0PGzgMTB1G4Ox8n0gfqUnpngroDRmXiyuA0XgcNk4g5sdyjj4oZWhw2cA5rQRuCQVueJNj41Z79tD9kvviBTsO/pDIZXKs337G1deIj+VjOVrh8jgQmZhxrr5R/4Wh+crkPG7t8Nipeeo/mhyGRhd5ELOodFG4d8p7unvBu4kHla6ywQR1oI4YWNiijaGMYwbBrQNgAPMF8q1YaVeOvXhjrwRtDWRRNDWtA7gAOgCyrCuuJjNp1QSIiLUgiIgIiICIiAi
IgIiICIiAiIgIiICIiAufZYt937SwJPN4sZfYebb21jd/P+TzfpHn6Cuf5Xf3ftLdW7eLGX6EDf/zWN7vPt+Tp3b+ZB0BERAREQEREBERAREQEREBERAREQEREBERAREQEREBERAXPcsB/tA6VPM0HxXzHk7dT/veM677d36fOP0dCXPctt/tBaV6nm8V8xsOX/i8Z5/8A9/2QdCREQEREBERAREQEREBERAREQEREBERAREQERaeXy1fB46a7aLhDEBuGNLnOJIDWtA7ySQAPOSFYiaptGsbiKlP1Dquby4cVia7HdRHYuyOkaP8A3cse2/pAJHylfnw7rD8Qwfrc32a9fZa98c4Wy7oqR4d1h+IYP1ub7NPDusPxDB+tzfZp2WvfHOCy7rwHrH2e2V097IivibXCud2ocTHc06MfFmA7t5Z7FZzXsd7X35T7XG2w8oPB8wXsXw7rD8Qwfrc32a5BnvY/zah9kHh+LVjH4YZnHVexNQWJDFPM0csU7j2e/Oxp2H/Kz/D1dlr3xzgs9LIqR4d1h+IYP1ub7NPDusPxDB+tzfZp2WvfHOCy7oqR4d1h+IYP1ub7NPDusPxDB+tzfZp2WvfHOCy7oqUzPaua7d+NwsjR3tbdmaT+nsjt+pWPAZyHP0PbEbHwSMeYpq8u3PDI33zHbdOnpG4IIIJBBWqvArw4zp1eE3LJJERaEEREBERAREQEREBERAREQFUuJh2wVEeY5ahuD85jVtVR4m/eKh+dqH8zGvTk3f0ecMqdcNtERepiIiICKJy2qsXgsthsbesmG7mJn16MXZvd2r2RukcNwCG7Ma47uIHTbv6KRt24KFWazZmjr1oWOklmlcGsY0DcucT0AAG5JUGVFr43I1cxjqt+lPHapWomTwTxO5mSRuAc1zT5wQQR+VbCoItXKZWng8bayORtQ0aFWJ009mw8MjijaN3Oc49AAASSVmrzx2oI5oXiSKRoex7e5zSNwQgyLR0Af6V1kPMMszYAf8DVK3lo6A++2s/zvH/I1VZ7uvy/cMo1SuKIi5bEREQEREBERAREQEREBERAVR4m/eKh+dqH8zGrcqjxN+8VD87UP5mNenJu/o84ZU64bapHGvU1PSPDDOZG7NlIIuSOux2EkbHddLLI2KJsTndGuc97W8x6DffzK7qK1TpbFa10/dwecpR5HFXGdnPWl32eNwR1BBBBAIIIIIBBBC9M6mLzLpStxPZkOJvD+nl7uIzEunamSw5y2ddl5aU0kk0bh7adG1zecRjps4MPVpO6zxRal1Nw/vYHS1rWdfUWAz1eTUun8rqD+k3V3QbmCpf3I5H7tla7mbzbOG7AQF2Cn7HXh9RZkBFgXOfkaRx92aW/ZkltQl7X8skjpC55BY3lc4lzQNmkAkL432OfD5mAfhm4KVtR91uQfK3I2hadYawxtkNjte1JDCWjd/QEha82RzHEanhzWq+BGS03qLU8mOyFrK421WzN6UvkMNS04stRc3LJJHKzbmIJ8huzj0Kr+GqZbFaV17ozXuX1W/XE+mb1508uZfNjclCwnexU5SDAQSxrotmbNdts4EleiMZwk0jhYdMQ0MNHUi00+aTFMhlkaK75Y3xyu995Zc2R+5fzHdxPf1WnovgZofh9dtW8HgmVrFisab3z2JrPLXJ5jCwSvcGRk7Esbs07Dp0VzZHG8fVq6V9jvw0wuOvatv5fVUdD2lXx+oJYZnymkJHsFmQuNes1jHOLY9tthyjqVWqWqtaxcOsjp/IagymPyWN4lY7AMuw5Q27UVSZ9ZzojZdG0zbdu8cz2dRsCDsu8wexv4eVdPHBw4KWPGCyy5FE3JWg6tKwODHQP7Xmg2D3DaMtGziNtlt47gHoLEV5IKWAbWhkyFTKvjjtThr7dYh0M5HP1eCAXE+/I8vmUzZHCeKWPt4vTXsgdFSZ3OZLCUtKVszT9v5OaeeCR7LPaR9s5xe6JxgYSxxLdi4bbOIXonhXputpfQmIq1bmQvRSV45+1yV+W5Ju5jTsHyucQ30NB2HmC3Z9AaftZjN5SfGxz3M1RjxuQdK5z2WKzO05Y3MJ5dvusm+wBPN136L8aD4eYHhnhXYnTtSaljzJ2vYy25rHKeVrdmmV7i1oa1oDQQBt0CyiLSLGtHQH321n+d4/5Gqt5aOgPvtrP87x/yNVbJ7uvy/cMo1SuKIi5bEREQEREBERAREQEREBERAVR4m/eKh+dqH8zGrcorU2D8YcPLTbN7WmD45oZuXm7OWN4ewkbjcczRuNxuNxuN1vwKooxaaqtUTCxoloooZ9/UVfyJdJ2rEg6OfSuVnRH5WmSRjtvytB+RanjPmDfbTbo3LvmLXOcWTVHMZy8m4e8TcrXESNIaSCRuQCGkjoZn1R90dSyyIoTwtnviZlfWqX26eFs98TMr61S+3TM+qPujqtk2ihPC2e+JmV9apfbqr3eMdbH8Qsfoexg78WqshUfdrY4z1eaSFm/M7m7blHc47E7kNJA2BTM+qPujqWdDRQnhbPfEzK+tUvt08LZ74mZX1ql9umZ9UfdHUsm0UJ4Wz3xMyvrVL7dPC2e+JmV9apfbpmfVH3R1LJtaOgPvtrP87x/yNVRGP1RlcpI+GHSmRgsNBJiuWK0TmgPczmLe1Lw0ljtncpDgNwSCFbdKYObC0rDrcrJb92c2rJi37Nry1rQ1m/Xla1jW7nbfbfYb7DXiTFGHVEzGnRomJ2xOzyNUJtERcxiIiICIiAiIgIiICIiAiIgIvjnBjS5xDWgbknuCgY32NT2GyRyTUsRBOfeiNzcpGYuhDtyWxczz3crnOiBB7M/dA/M+Qs6lE1bEyy06ZjhlZnIuykilBk8uOEbkl3I07vLeUdowt5yHBstjcVTw8MkNGrFUikmksPbEwNDpJHl8jzt3uc5xJPnJKzVq0NKtFXrxMggiYI44omhrWNA2DQB0AA6bLKgIiIC/njxB9jLxuz3suqmsq2otK1c/OZszi43XbRigqVJYIhA8iv5xYjBABB3fufT/Q5c/wAhyzcfMByhpdX0zkec7nmaJLVHl6d2x7J3+n8qDoCIiAiIgis3p2vmWPla99DJivJWr5WqyP21Va8tLuzc9rhtzMjcWuBa4sbzNcBstV+opcRekhzcUNKpLahq0L0cjntsukb0bIOUdi/nBYASWu5o9ncz+Rs+iAirIqy6Jqh1NktrT9WCxNNWHbWrjHc3aNEI3c57QC9oiAJADGsGwDVYoJ47MLJoniSJ7Q5rm9xB7igyIiICIiAiIgIiICIiAiLFan9q1ppuR8vZsL+SMbudsN9gPOUEBZEOsr1zHu5J8JUdJTyVK5j+eO690bHBjXv8l0bQ883K1wL9m8wMcjDZFA6Dj5NF4R3a5SYyVI5i/Nn/AH3d7Q4iYDoHjm2LR0BGw6AKeQEREBERAXPuHBOq9Q6g1xvzUciIsdiHb7h9GAvInHXbaWWWZwI99G2E+jb96ltS8QsrY0pjJnR4iu8Mz+Qhc5ruXYO9pROHdI8Edo4Hdkbths+RrmXqvXiqQRwQRshhiaGMj
jaGtY0DYAAdwA8yDIiIgIiICIiAoG7RfgbdrK0Ws7CeT2xkoXNlke8Nj5eeJrOby+VrByhp5+UDoepnkQa2OyNXMY+rfo2I7dK1E2eCxC4OZLG4BzXNI6EEEEH5Vsqv4WWSjqTMYuR+UtMcGZGGzbiBrxtlLmmvFKO8sdEXlrurRMzYkbBtgQEREBERAREQERQuY1tp7T9oVsnnMdj7JHN2Nm0xj9vTyk77LOmiqubUxeVtdNIqt7qWjvjTiPXY/pVZ4l3+G3FfQmZ0ln9R4qbFZSDsZQy/G17SCHMe07++a9rXDfpu0bgjotvZ8bgnlK5s7kjoXiBpeGWpow6k31NSdLSGKzuQidmJxCXDtnx83O8PjYJWv28qNzXnvKvy/nF7CngvR4K+yJ1ff1Hm8XJj8PTNbE5T2ywRXDM4fdIzvtuI2uDh3tL9j8vvT3UtHfGnEeux/SnZ8bgnlJmzuWlFVvdS0d8acR67H9Ke6lo7404j12P6U7PjcE8pM2dy0qm57O5DUGXk05puXsJIi0ZXM8vM3HsI37KLccr7Lm9zTuImuEjwd445ojJcRqus86zS+ls5UgfLHz28vFPG50LCPeVmu3Esx9OxZGOrtzysdesHg6Gm8XDjsbWbVpw8xbG0kkuc4ue9zjuXOc5znOc4lznOJJJJK1VUVUTauLJaz5gcDQ0xiK2MxlcVqVcEMZzFxJJLnOc5xLnvc4lznuJc5ziSSSSpBEWCCIiAiIgIiICIiCu2yG8Q8UN8yS/F3OkX3tHLNW/8b0Tnm+5+lgn9CsS45k/ZFcKq/EbFQy8T8LE9mNvtfEzO1Bjw4TVBtP8AdOk469mP8Ptj0LsaAiIgIiICIiDSzVx2Pw960wAvggklaD6WtJH/AMKo6SqR1sBSkA5p7MTJ55ndXzSOaC57iepJJ/R3dwVn1V+DGY+ZzfwFV7TX4OYr5pF/AF0MDRhT5rsSSIizQREQEREGrksbWy1OStajEkT/AJdi0jqHNI6tcDsQ4dQQCOq39B5SfNaLwd60/tbM9OJ8sm23O7lG7tvNueu3yrEsPCz+rnTnzGL+FY4unBnwmPxPRdi0oiLnIIiICIq3rrWcGisQLDoxZuTv7KrV5uXtX95JPma0bkn0DYbkgHZh4dWLXFFEXmRM5PLUcJUdbyNyvQqt99PalbGwflc4gKsS8YdHQvLTnIXEdN445Hj9YaQuH5O1azuR8IZWw6/e68skg8mIb+9jb3Mb0HQdTsCST1WNfW4XsPDin5tc38P7cvDuPuzaN+Gm+ry/UT3ZtG/DTfV5fqLhyLd8Dybiq5x0Lw4FxI9jppPVPsxsdqSvcjPD3JSeGMq4RSBsdhh3fBy7c33V/Keg2Ae70L3d7s2jfhpvq8v1Fw5E+B5NxVc46F4dx92bRvw031eX6i+s4yaNe7bw3G35XwyNH6y1cNRPgeTcVXOOheHpbD6gxmoa7p8XkKuQiaeVzq0rZA0+g7HofkKkF5YgMlK9HepTyUb8fvLVchr2/IehDh0HkuBB26gruvDfXw1jSmr22sgy9MNE8bPeytPdKweZpIII72kEdRsTxcu9l1ZLT7yib0+sLr1LkiIuEiL1V+DGY+ZzfwFV7TX4OYr5pF/AFYdVfgxmPmc38BVe01+DmK+aRfwBdHB7mfP9Lsb1h0jIJHQsbLMGksY53KHO26AnY7dfPsV524W8etUYzgrmNZ68xUVivUvW4Ks2Puiazdn8ISV46wh7GNrNnckbXcx5gOYhvVejV57h4Baul0DqXQU+RwsWAdfmy+By0Jldchsm8LkTZ4i0M5WvLmkteSRt0Ck32IsDfZCT6WtZmpxD0wdIWqGFlz8XtXINyEdmtE4Nla14YzaVrnMHJtsecbOIWCvxvzs9iriNT6Om0dNqDF27WEsx5Ntpz3xQ9q6KUNY0wyhh5wAXDyXeVuFG5ngRqji5kM3e4i3MNRdPp2xp+hU086WaOHt3NdJZe+VrCXbxx7MA2AB3J71u47hRrrV+qtNZHX9/BMqaap2oajMCZnvuWJ4DXdPL2jWiMCMv2Y3m6vPldAp/yEHpLjjmNNcMOC2MixbtV6o1XhGTNnyuWFRkj4oInSc072vL5XmQbN2Jds4kjZehMfNPZoVprNY07MkTXy1y8P7J5AJZzDodjuNx0Oy8/WOC2vncEMDw9sUdC6ir4+pJjpJMr7ZaOzY1rKtiPlY4smaA4uA8+3K8Ltmg9P29KaJwGFv5KTMXsdQgqT5CbfnsvZGGukO5J3cQT1JPXqSrTfaJ1YeFn9XOnPmMX8KzLDws/q5058xi/hVxe5nzj8SuxaURFzkEREBcC4s5J2S4iWIHOJixtWOCNp7muk+6PI/KOyB/5Au+rgXFnGuxnEOedzSIsnVjnjee5z4/ubwPyDsj/nC73sXN7Vp12m3p+rrslVkWvkb8WLoz25xKYYWF7xDC+V+w9DGAucfkAJVVHFvT5/us5/07kPsF9vViUUaKpiGtcnODWkkgAdST5lxOl7KDD3chUeyDHnCW7bKkU7M1A695T+RsjqY8sMLiD74uDTuWhXtnFHT997avY5o9uez2fp++xp36dXGAADr3k7KvcPtCau0HFj9Ptfp+9pmhI5sV6Zsovur7ktYWAcnMNwOfm7h73deTErrrqp9zVo22tO637Vin43X68OUyUmli3T2LzMmHuX/CDe0aW2BCJWRcnlN3c0kFzSNyBzAbnX4mcUMxNh9c0dL4Sa5BhaM8V3NNvisas5gL9oRsS98bXNcdi3Y9Ad1nyPCbL2+HWsMAyzSFzMZ2bJ13ue/s2xPtsmAeeTcO5WkbAEb+fzrBqHhprCv484/TlnCyYTVQmmkGTdMyarYlgEUhbyNIe13K09dtj6fPoqnKM2030x4X2/wdH0XPLa0dgpppHzTSUIHvkkcXOc4xtJJJ7yT51MKi4/W+K0bjKGDvtykl3H1oa0zqeFvTxFzY2glsjIS1w+UFZ/dd08f7rO/9O5D7Be2nFw4iImqL+aLmpbRWSdh9e4CyxxaJpzSlA/tslaQB/rEbv8qreFzVbP46O7UFhsDyQBarS15Oh2O7JGtcO7zjqrJonGuzOvcBWY3mbBObspH9hkbSQf8AWYx/mUyiaJwK5q1Wn8Mqdb0giIvzBUXqr8GMx8zm/gKr2mvwcxXzSL+AK05mm7I4i9UYQHzwSRAnzFzSP/tVDSVyOxgacIPJZrQsgsQO6Phka0BzHA9QQf1jYjoQuhgacKY8V2JhERZoIiICIiAsPCz+rnTnzGL+FY8nlK2IqPs2pRHG3oB3ue49A1rR1c4kgBo3JJAHUqQ0Ji58JozCUbTOzswU4mSx778j+Ubt38+x6b/IscXRgz4zH4nquxOoiLnIIiICrmudGQa1w4rPkFa3C/tatrl5jE/u6jpu0jcEb9x6EEAixotmHiVYVcV0TaYHl3K1LWn8h7Qy1c4+515WvO7JR/ijf3PHd3dRuNw09FjXpzJYulmaj6t+pBerP99DZibIw/laQQqxLwg0dK4uOBrtJ67RuewfqBAX1uF7cw5p+bRN
/D+locKRdy9xvRvwHF+1k+snuN6N+A4v2sn1lu+OZNw1co6locNRdy9xvRvwHF+1k+snuN6N+A4v2sn1k+OZNw1co6locNRdy9xvRvwHF+1k+svrODujWO38BQO+R73uH6i7ZPjmTcNXKOpaN7hdYS5C8yjRgkv33+9q1wHPPynrs0dR5TiAN+pXduHGgho2jNPaeyfL2+UzyM95G0e9iYe8tBJO56uJJ2A2a2xYjBY3AVzBjKFbHwk7llaJsYcfSdh1Pylb64mXe1Ksrp93RFqfWV1ahERcNBQuY0Vp/UNgWMpg8bkZwOUS2qkcjwPRu4E7KaRZU11UTembSalW9yvRnxTwn7vi+qnuV6M+KeE/d8X1VaUW7tGNxzzlbzvVb3K9GfFPCfu+L6qe5Xoz4p4T93xfVVpRO0Y3HPOS871W9yvRnxTwn7vi+qnuV6M+KeE/d8X1VaUTtGNxzzkvO9B4rQ2nMFZbZx2AxlCw3flmrVI43t379iBuN1OIi1VV1VzeqbprERFgCIiAiIgIiICIiAiIgIiICIiAiIg//9k=", - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from langgraph.graph import START, StateGraph\n", - "from IPython.display import Image, display\n", - "\n", - "# Define a new graph for the agent\n", - "builder = StateGraph(GraphState)\n", - "\n", - "# Define the two nodes we will cycle between\n", - "builder.add_node(\"assistant\", assistant)\n", - "builder.add_node(\"tools\", tool_node)\n", - "\n", - "# Set the entrypoint as `agent`\n", - "builder.add_edge(START, \"assistant\")\n", - "\n", - "# Making a conditional edge\n", - "# should_continue will determine which node is called next.\n", - "builder.add_conditional_edges(\"assistant\", should_continue, [\"tools\", END])\n", - "\n", - "# Making a normal edge from `tools` to `agent`.\n", - "# The `agent` node will be called after the `tool`.\n", - "builder.add_edge(\"tools\", \"assistant\")\n", - "\n", - "# Compile and display the graph for a visual overview\n", - "react_graph = builder.compile()\n", - "display(Image(react_graph.get_graph(xray=True).draw_mermaid_png()))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wlNB4fI4ZQv5" - }, - "source": [ - "To test our setup, we will run the agent with a query. The agent will fetch the price of copper using the metals.dev API." 
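Besides `invoke` (used next), a compiled graph can also be streamed, which yields one update per node execution and makes the assistant -> tools -> assistant loop visible. A short sketch, assuming an OpenAI API key is configured; the shape of each streamed step may vary with the langgraph version.

```python
from langchain_core.messages import HumanMessage

# Stream the graph instead of invoking it; in "updates" mode each step is a dict keyed
# by the node that just ran, so you can watch the loop unfold message by message.
inputs = {"messages": [HumanMessage(content="What is the price of copper?")]}
for step in react_graph.stream(inputs, stream_mode="updates"):
    for node_name, update in step.items():
        print(node_name, "->", update["messages"][-1].type)
```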
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "rzt0I-n2ZQv5" - }, - "outputs": [], - "source": [ - "from langchain_core.messages import HumanMessage\n", - "\n", - "messages = [HumanMessage(content=\"What is the price of copper?\")]\n", - "result = react_graph.invoke({\"messages\": messages})" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "esoHsop8ZQv5", - "outputId": "0d52f2db-f2da-4f5a-943e-e549b731f01e" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[HumanMessage(content='What is the price of copper?', id='4122f5d4-e298-49e8-a0e0-c98adda78c6c'),\n", - " AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_DkVQBK4UMgiXrpguUS2qC4mA', 'function': {'arguments': '{\"metal_name\":\"copper\"}', 'name': 'get_metal_price'}, 'type': 'function'}]}, response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 116, 'total_tokens': 134, 'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0}, 'completion_tokens_details': {'reasoning_tokens': 0, 'audio_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0ba0d124f1', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-0f77b156-e43e-4c1e-bd3a-307333eefb68-0', tool_calls=[{'name': 'get_metal_price', 'args': {'metal_name': 'copper'}, 'id': 'call_DkVQBK4UMgiXrpguUS2qC4mA', 'type': 'tool_call'}], usage_metadata={'input_tokens': 116, 'output_tokens': 18, 'total_tokens': 134}),\n", - " ToolMessage(content='0.0098', name='get_metal_price', id='422c089a-6b76-4e48-952f-8925c3700ae3', tool_call_id='call_DkVQBK4UMgiXrpguUS2qC4mA'),\n", - " AIMessage(content='The price of copper is $0.0098 per gram.', response_metadata={'token_usage': {'completion_tokens': 14, 'prompt_tokens': 148, 'total_tokens': 162, 'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0}, 'completion_tokens_details': {'reasoning_tokens': 0, 'audio_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0ba0d124f1', 'finish_reason': 'stop', 'logprobs': None}, id='run-67cbf98b-4fa6-431e-9ce4-58697a76c36e-0', usage_metadata={'input_tokens': 148, 'output_tokens': 14, 'total_tokens': 162})]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "result[\"messages\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wsK_VEDSZQv6" - }, - "source": [ - "### Converting Messages to Ragas Evaluation Format\n", - "\n", - "In the current implementation, the GraphState stores messages exchanged between the human user, the AI (LLM's responses), and any external tools (APIs or services the AI uses) in a list. Each message is an object in LangChain's format\n", - "\n", - "```python\n", - "# Implementation of Graph State\n", - "class GraphState(TypedDict):\n", - " messages: Annotated[list[AnyMessage], add_messages]\n", - "```\n", - "\n", - "Each time a message is exchanged during agent execution, it gets added to the messages list in the GraphState. However, Ragas requires a specific message format for evaluating interactions.\n", - "\n", - "Ragas uses its own format to evaluate agent interactions. So, if you're using LangGraph, you will need to convert the LangChain message objects into Ragas message objects. 
This allows you to evaluate your AI agents with Ragasโ€™ built-in evaluation tools.\n", - "\n", - "**Goal:** Convert the list of LangChain messages (e.g., HumanMessage, AIMessage, and ToolMessage) into the format expected by Ragas, so the evaluation framework can understand and process them properly." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To convert a list of LangChain messages into a format suitable for Ragas evaluation, Ragas provides the function [convert_to_ragas_messages][ragas.integrations.langgraph.convert_to_ragas_messages], which can be used to transform LangChain messages into the format expected by Ragas.\n", - "\n", - "Here's how you can use the function:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "from ragas.integrations.langgraph import convert_to_ragas_messages\n", - "\n", - "# Assuming 'result[\"messages\"]' contains the list of LangChain messages\n", - "ragas_trace = convert_to_ragas_messages(result[\"messages\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[HumanMessage(content='What is the price of copper?', metadata=None, type='human'),\n", - " AIMessage(content='', metadata=None, type='ai', tool_calls=[ToolCall(name='get_metal_price', args={'metal_name': 'copper'})]),\n", - " ToolMessage(content='0.0098', metadata=None, type='tool'),\n", - " AIMessage(content='The price of copper is $0.0098 per gram.', metadata=None, type='ai', tool_calls=None)]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ragas_trace # List of Ragas messages" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "n5mbTp5aZQv6" - }, - "source": [ - "## Evaluating the Agent's Performance" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "H885v5sxZQv6" - }, - "source": [ - "For this tutorial, let us evaluate the Agent with the following metrics:\n", - "\n", - "- [Tool call Accuracy](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/agents/#tool-call-accuracy):ToolCallAccuracy is a metric that can be used to evaluate the performance of the LLM in identifying and calling the required tools to complete a given task. \n", - "\n", - "- [Agent Goal accuracy](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/agents/#agent-goal-accuracy): Agent goal accuracy is a metric that can be used to evaluate the performance of the LLM in identifying and achieving the goals of the user. This is a binary metric, with 1 indicating that the AI has achieved the goal and 0 indicating that the AI has not achieved the goal.\n", - "\n", - "\n", - "First, let us actually run our Agent with a couple of queries, and make sure we have the ground truth labels for these queries." 
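Concretely, the ground truth for the two queries used below boils down to an expected tool call (for Tool Call Accuracy) and a short reference statement of the user's goal (for Agent Goal Accuracy). As a compact sketch of what we will pass to the metrics (the variable names here are just for illustration):

```python
import ragas.messages as r

# Ground truth for "What is the price of copper?": the agent is expected to make this
# exact tool call.
expected_tool_calls = [r.ToolCall(name="get_metal_price", args={"metal_name": "copper"})]

# Ground truth for "What is the price of 10 grams of silver?": a short reference
# statement of the user's goal, used by AgentGoalAccuracyWithReference.
reference_goal = "Price of 10 grams of silver"
```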
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7kRRIyTAZQv6" - }, - "source": [ - "### Tool Call Accuracy" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "CC973Yq1ZQv6", - "outputId": "d5bf508d-f3ba-4f2e-a4c6-e6efbf229603" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "1.0" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ragas.metrics import ToolCallAccuracy\n", - "from ragas.dataset_schema import MultiTurnSample\n", - "from ragas.integrations.langgraph import convert_to_ragas_messages\n", - "import ragas.messages as r\n", - "\n", - "\n", - "ragas_trace = convert_to_ragas_messages(\n", - " messages=result[\"messages\"]\n", - ") # List of Ragas messages converted using the Ragas function\n", - "\n", - "sample = MultiTurnSample(\n", - " user_input=ragas_trace,\n", - " reference_tool_calls=[\n", - " r.ToolCall(name=\"get_metal_price\", args={\"metal_name\": \"copper\"})\n", - " ],\n", - ")\n", - "\n", - "tool_accuracy_scorer = ToolCallAccuracy()\n", - "tool_accuracy_scorer.llm = ChatOpenAI(model=\"gpt-4o-mini\")\n", - "await tool_accuracy_scorer.multi_turn_ascore(sample)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Tool Call Accuracy: 1, because the LLM correctly identified and used the necessary tool (get_metal_price) with the correct parameters (i.e., metal name as \"copper\")." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rGOL1CBsZQv6" - }, - "source": [ - "### Agent Goal Accuracy" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "id": "FA0kMvTfZQwB" - }, - "outputs": [], - "source": [ - "messages = [HumanMessage(content=\"What is the price of 10 grams of silver?\")]\n", - "\n", - "result = react_graph.invoke({\"messages\": messages})" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "YJr4Hxn8ZQwB", - "outputId": "9797c93b-47a2-4264-b535-f182effb396b" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[HumanMessage(content='What is the price of 10 grams of silver?', id='51a469de-5b7c-4d01-ab71-f8db64c8da49'),\n", - " AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_rdplOo95CRwo3mZcPu4dmNxG', 'function': {'arguments': '{\"metal_name\":\"silver\"}', 'name': 'get_metal_price'}, 'type': 'function'}]}, response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 120, 'total_tokens': 137, 'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0}, 'completion_tokens_details': {'reasoning_tokens': 0, 'audio_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0ba0d124f1', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-3bb60e27-1275-41f1-a46e-03f77984c9d8-0', tool_calls=[{'name': 'get_metal_price', 'args': {'metal_name': 'silver'}, 'id': 'call_rdplOo95CRwo3mZcPu4dmNxG', 'type': 'tool_call'}], usage_metadata={'input_tokens': 120, 'output_tokens': 17, 'total_tokens': 137}),\n", - " ToolMessage(content='1.0523', name='get_metal_price', id='0b5f9260-df26-4164-b042-6df2e869adfb', tool_call_id='call_rdplOo95CRwo3mZcPu4dmNxG'),\n", - " AIMessage(content='The current price of silver is approximately $1.0523 per gram. 
Therefore, the price of 10 grams of silver would be about $10.52.', response_metadata={'token_usage': {'completion_tokens': 34, 'prompt_tokens': 151, 'total_tokens': 185, 'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0}, 'completion_tokens_details': {'reasoning_tokens': 0, 'audio_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0ba0d124f1', 'finish_reason': 'stop', 'logprobs': None}, id='run-93e38f71-cc9d-41d6-812a-bfad9f9231b2-0', usage_metadata={'input_tokens': 151, 'output_tokens': 34, 'total_tokens': 185})]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "result[\"messages\"] # List of Langchain messages" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "StDNqR2vZQwB", - "outputId": "47e914a4-3e48-4932-8b20-752441b42fd4" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[HumanMessage(content='What is the price of 10 grams of silver?', metadata=None, type='human'),\n", - " AIMessage(content='', metadata=None, type='ai', tool_calls=[ToolCall(name='get_metal_price', args={'metal_name': 'silver'})]),\n", - " ToolMessage(content='1.0523', metadata=None, type='tool'),\n", - " AIMessage(content='The current price of silver is approximately $1.0523 per gram. Therefore, the price of 10 grams of silver would be about $10.52.', metadata=None, type='ai', tool_calls=None)]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ragas.integrations.langgraph import convert_to_ragas_messages\n", - "\n", - "ragas_trace = convert_to_ragas_messages(\n", - " result[\"messages\"]\n", - ") # List of Ragas messages converted using the Ragas function\n", - "ragas_trace" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "c6u9-RYdZQwB", - "outputId": "ebf8fdd8-88fc-47c3-e1e2-b401956c0633" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "1.0" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from ragas.dataset_schema import MultiTurnSample\n", - "from ragas.metrics import AgentGoalAccuracyWithReference\n", - "from ragas.llms import LangchainLLMWrapper\n", - "\n", - "\n", - "sample = MultiTurnSample(\n", - " user_input=ragas_trace,\n", - " reference=\"Price of 10 grams of silver\",\n", - ")\n", - "\n", - "scorer = AgentGoalAccuracyWithReference()\n", - "\n", - "evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o-mini\"))\n", - "scorer.llm = evaluator_llm\n", - "await scorer.multi_turn_ascore(sample)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Agent Goal Accuracy: 1, because the LLM correctly achieved the userโ€™s goal of retrieving the price of 10 grams of silver." - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "t1ub1OLYZQvz" + }, + "source": [ + "# Building and Evaluating a ReAct Agent for Fetching Metal Prices\n", + "\n", + "AI agents are becoming increasingly valuable in domains like finance, e-commerce, and customer support. These agents can autonomously interact with APIs, retrieve real-time data, and perform tasks that align with user goals. 
Evaluating these agents is crucial to ensure they are effective, accurate, and responsive to different inputs.\n", + "\n", + "In this tutorial, we'll:\n", + "\n", + "1. Build a [ReAct agent](https://arxiv.org/abs/2210.03629) to fetch metal prices.\n", + "2. Set up an evaluation pipeline to track key performance metrics.\n", + "3. Run and assess the agent's effectiveness with different queries.\n", + "\n", + "Click the [link](https://colab.research.google.com/github/explodinggradients/ragas/blob/main/docs/howtos/integrations/langgraph_agent_evaluation.ipynb) to open the notebook in Google Colab." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "- Python 3.8+\n", + "- Basic understanding of LangGraph, LangChain and LLMs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Q8Ms4ngAZQv1" + }, + "source": [ + "## Installing Ragas and Other Dependencies\n", + "Install Ragas and Langgraph with pip:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "vQk4aWbpZQv1", + "outputId": "4af0ac60-3d1a-4e41-de6e-d33f74921845" + }, + "outputs": [], + "source": [ + "%pip install langgraph==0.2.44\n", + "%pip install ragas\n", + "%pip install nltk" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eJJ-WKWMZQv2" + }, + "source": [ + "## Building the ReAct Agent" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lAXAIbo7ZQv2" + }, + "source": [ + "### Initializing External Components\n", + "To begin, you have two options for setting up the external components:\n", + "\n", + "1. Use a Live API Key: \n", + "\n", + " - Sign up for an account on [metals.dev](https://metals.dev/) to get your API key. \n", + " \n", + "2. Simulate the API Response: \n", + "\n", + " - Alternatively, you can use a predefined JSON object to simulate the API response. This allows you to get started more quickly without needing a live API key. \n", + "\n", + "\n", + "Choose the method that best fits your needs to proceed with the setup." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PNZijyBXZQv3" + }, + "source": [ + "### Predefined JSON Object to simulate API response\n", + "If you would like to quickly get started without creating an account, you can bypass the setup process and use the predefined JSON object given below that simulates the API response." 
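If you do have a metals.dev API key, fetching the same data live might look roughly like the sketch below. The endpoint, query parameters, and response shape shown here are assumptions for illustration, so check the metals.dev documentation for the exact API.

```python
import requests

API_KEY = "your-metals-dev-api-key"  # placeholder

# Hypothetical request; the endpoint, parameters, and response shape are assumptions,
# so verify them against the metals.dev documentation before relying on this.
response = requests.get(
    "https://api.metals.dev/v1/latest",
    params={"api_key": API_KEY, "currency": "USD", "unit": "g"},
    timeout=10,
)
response.raise_for_status()

# Assumed response shape: a "metals" mapping similar to the predefined dict below.
metal_price = response.json().get("metals", {})
print(metal_price.get("gold"))
```

If you skip this, the predefined dictionary below works just as well for the rest of the tutorial.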
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "puMC36BPZQv3" + }, + "outputs": [], + "source": [ + "metal_price = {\n", + " \"gold\": 88.1553,\n", + " \"silver\": 1.0523,\n", + " \"platinum\": 32.169,\n", + " \"palladium\": 35.8252,\n", + " \"lbma_gold_am\": 88.3294,\n", + " \"lbma_gold_pm\": 88.2313,\n", + " \"lbma_silver\": 1.0545,\n", + " \"lbma_platinum_am\": 31.99,\n", + " \"lbma_platinum_pm\": 32.2793,\n", + " \"lbma_palladium_am\": 36.0088,\n", + " \"lbma_palladium_pm\": 36.2017,\n", + " \"mcx_gold\": 93.2689,\n", + " \"mcx_gold_am\": 94.281,\n", + " \"mcx_gold_pm\": 94.1764,\n", + " \"mcx_silver\": 1.125,\n", + " \"mcx_silver_am\": 1.1501,\n", + " \"mcx_silver_pm\": 1.1483,\n", + " \"ibja_gold\": 93.2713,\n", + " \"copper\": 0.0098,\n", + " \"aluminum\": 0.0026,\n", + " \"lead\": 0.0021,\n", + " \"nickel\": 0.0159,\n", + " \"zinc\": 0.0031,\n", + " \"lme_copper\": 0.0096,\n", + " \"lme_aluminum\": 0.0026,\n", + " \"lme_lead\": 0.002,\n", + " \"lme_nickel\": 0.0158,\n", + " \"lme_zinc\": 0.0031,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2SduQYJbZQv3" + }, + "source": [ + "### Define the get_metal_price Tool\n", + "\n", + "The get_metal_price tool will be used by the agent to fetch the price of a specified metal. We'll create this tool using the @tool decorator from LangChain.\n", + "\n", + "If you want to use real-time data from the metals.dev API, you can modify the function to make a live request to the API." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "1X2TsFLfZQv3" + }, + "outputs": [], + "source": [ + "from langchain_core.tools import tool\n", + "\n", + "\n", + "# Define the tools for the agent to use\n", + "@tool\n", + "def get_metal_price(metal_name: str) -> float:\n", + " \"\"\"Fetches the current per gram price of the specified metal.\n", + "\n", + " Args:\n", + " metal_name : The name of the metal (e.g., 'gold', 'silver', 'platinum').\n", + "\n", + " Returns:\n", + " float: The current price of the metal in dollars per gram.\n", + "\n", + " Raises:\n", + " KeyError: If the specified metal is not found in the data source.\n", + " \"\"\"\n", + " try:\n", + " metal_name = metal_name.lower().strip()\n", + " if metal_name not in metal_price:\n", + " raise KeyError(\n", + " f\"Metal '{metal_name}' not found. Available metals: {', '.join(metal_price.keys())}\"\n", + " )\n", + " return metal_price[metal_name]\n", + " except Exception as e:\n", + " raise Exception(f\"Error fetching metal price: {str(e)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "j85XikcLZQv4" + }, + "source": [ + "### Binding the Tool to the LLM\n", + "With the get_metal_price tool defined, the next step is to bind it to the ChatOpenAI model. This enables the agent to invoke the tool during its execution based on the user's requests, allowing it to interact with external data and perform actions beyond its native capabilities." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "lsxVT0lUZQv4" + }, + "outputs": [], + "source": [ + "from langchain_openai import ChatOpenAI\n", + "\n", + "tools = [get_metal_price]\n", + "llm = ChatOpenAI(model=\"gpt-4o-mini\")\n", + "llm_with_tools = llm.bind_tools(tools)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yuDuSrmQZQv4" + }, + "source": [ + "In LangGraph, state plays a crucial role in tracking and updating information as the graph executes. 
As different parts of the graph run, the state evolves to reflect the changes and contains information that is passed between nodes.\n", + "\n", + "For example, in a conversational system like this one, the state is used to track the exchanged messages. Each time a new message is generated, it is added to the state and the updated state is passed through the nodes, ensuring the conversation progresses logically.\n", + "\n", + "### Defining the State\n", + "To implement this in LangGraph, we define a state class that maintains a list of messages. Whenever a new message is produced it gets appended to this list, ensuring that the conversation history is continuously updated." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "JHHXxYT1ZQv4" + }, + "outputs": [], + "source": [ + "from langgraph.graph import END\n", + "from langchain_core.messages import AnyMessage\n", + "from langgraph.graph.message import add_messages\n", + "from typing import Annotated\n", + "from typing_extensions import TypedDict\n", + "\n", + "\n", + "class GraphState(TypedDict):\n", + " messages: Annotated[list[AnyMessage], add_messages]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1KGbjrAOZQv4" + }, + "source": [ + "### Defining the should_continue Function\n", + "The `should_continue` function determines whether the conversation should proceed with further tool interactions or end. Specifically, it checks if the last message contains any tool calls (e.g., a request for metal prices).\n", + "\n", + "- If the last message includes tool calls, indicating that the agent has invoked an external tool, the conversation continues and moves to the \"tools\" node.\n", + "- If there are no tool calls, the conversation ends, represented by the END state." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "KjppKPRDZQv4" + }, + "outputs": [], + "source": [ + "# Define the function that determines whether to continue or not\n", + "def should_continue(state: GraphState):\n", + " messages = state[\"messages\"]\n", + " last_message = messages[-1]\n", + " if last_message.tool_calls:\n", + " return \"tools\"\n", + " return END" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZbyJRNRvZQv4" + }, + "source": [ + "### Calling the Model\n", + "The `call_model` function interacts with the Language Model (LLM) to generate a response based on the current state of the conversation. It takes the updated state as input, processes it and returns a model-generated response." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "ZYflc7eZZQv4" + }, + "outputs": [], + "source": [ + "# Define the function that calls the model\n", + "def call_model(state: GraphState):\n", + " messages = state[\"messages\"]\n", + " response = llm_with_tools.invoke(messages)\n", + " return {\"messages\": [response]}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VzxIHVa2ZQv4" + }, + "source": [ + "### Creating the Assistant Node\n", + "The `assistant` node is a key component responsible for processing the current state of the conversation and using the Language Model (LLM) to generate a relevant response. It evaluates the state, determines the appropriate course of action, and invokes the LLM to produce a response that aligns with the ongoing dialogue." 
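Because the `assistant` node simply invokes `llm_with_tools`, it can help to preview what such a call returns: for a metal-price question the model typically replies not with text but with a structured tool call. A minimal sketch (requires an OpenAI API key; the exact output depends on the model):

```python
from langchain_core.messages import HumanMessage

# Call the tool-bound model directly and inspect the tool calls it proposes.
ai_msg = llm_with_tools.invoke([HumanMessage(content="What is the price of gold?")])

print(ai_msg.content)     # typically empty when the model decides to call a tool
print(ai_msg.tool_calls)  # e.g. [{'name': 'get_metal_price', 'args': {'metal_name': 'gold'}, ...}]
```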
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "_fPD6W2SZQv4" + }, + "outputs": [], + "source": [ + "# Node\n", + "def assistant(state: GraphState):\n", + " response = llm_with_tools.invoke(state[\"messages\"])\n", + " return {\"messages\": [response]}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Vc3No3agZQv5" + }, + "source": [ + "### Creating the Tool Node\n", + "The `tool_node` is responsible for managing interactions with external tools, such as fetching metal prices or performing other actions beyond the LLM's native capabilities. The tools themselves are defined earlier in the code, and the tool_node invokes these tools based on the current state and the needs of the conversation." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "vz2qlceBZQv5" + }, + "outputs": [], + "source": [ + "from langgraph.prebuilt import ToolNode\n", + "\n", + "# Node\n", + "tools = [get_metal_price]\n", + "tool_node = ToolNode(tools)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "M2FWZfGFZQv5" + }, + "source": [ + "### Building the Graph\n", + "The graph structure is the backbone of the agentic workflow, consisting of interconnected nodes and edges. To construct this graph, we use the StateGraph builder which allows us to define and connect various nodes. Each node represents a step in the process (e.g., the assistant node, tool node) and the edges dictate the flow of execution between these steps." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 266 + }, + "id": "FeGI8G3KZQv5", + "outputId": "4575b3ed-e162-4419-f44f-ff0086aaf546" + }, + "outputs": [ + { + "data": { + "image/jpeg": 
"/9j/4AAQSkZJRgABAQAAAQABAAD/4gHYSUNDX1BST0ZJTEUAAQEAAAHIAAAAAAQwAABtbnRyUkdCIFhZWiAH4AABAAEAAAAAAABhY3NwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAA9tYAAQAAAADTLQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAlkZXNjAAAA8AAAACRyWFlaAAABFAAAABRnWFlaAAABKAAAABRiWFlaAAABPAAAABR3dHB0AAABUAAAABRyVFJDAAABZAAAAChnVFJDAAABZAAAAChiVFJDAAABZAAAAChjcHJ0AAABjAAAADxtbHVjAAAAAAAAAAEAAAAMZW5VUwAAAAgAAAAcAHMAUgBHAEJYWVogAAAAAAAAb6IAADj1AAADkFhZWiAAAAAAAABimQAAt4UAABjaWFlaIAAAAAAAACSgAAAPhAAAts9YWVogAAAAAAAA9tYAAQAAAADTLXBhcmEAAAAAAAQAAAACZmYAAPKnAAANWQAAE9AAAApbAAAAAAAAAABtbHVjAAAAAAAAAAEAAAAMZW5VUwAAACAAAAAcAEcAbwBvAGcAbABlACAASQBuAGMALgAgADIAMAAxADb/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/2wBDAQMEBAUEBQkFBQkUDQsNFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBT/wAARCAD5ANYDASIAAhEBAxEB/8QAHQABAAIDAQEBAQAAAAAAAAAAAAUGAwQHCAECCf/EAFEQAAEEAQIDAgYLDAcGBwAAAAEAAgMEBQYRBxIhEzEVFiJBUZQIFBcyVVZhdNHS0yM1NlRxdYGRk5WytCU3QkNSgpIYJGRylqEzNFNiscHw/8QAGwEBAQADAQEBAAAAAAAAAAAAAAECAwUEBgf/xAAzEQEAAQIBCQUJAQADAAAAAAAAAQIRAwQSITFBUVKR0RQzYXGhBRMVI2KSscHhgSLw8f/aAAwDAQACEQMRAD8A/qmiIgIiICIiAsNq5XpR89ieOuz/ABSvDR+sqDu37uevz47FTGlVrnkt5NrQ5zX/APpQhwLS4d7nuBa3cNAc4u5Ptbh/p+F5llxcF+ydua1fb7ZmcR5y9+5/V0W+KKae8n/IW29u+NWF+F6HrLPpTxqwvwxQ9ZZ9KeKuF+B6HqzPoTxVwvwPQ9WZ9CvyfH0XQeNWF+GKHrLPpTxqwvwxQ9ZZ9KeKuF+B6HqzPoTxVwvwPQ9WZ9CfJ8fQ0HjVhfhih6yz6U8asL8MUPWWfSnirhfgeh6sz6E8VcL8D0PVmfQnyfH0NB41YX4Yoess+lblTIVb7S6rZhstHeYZA4D9S0/FXC/A9D1Zn0LUtaB05bkErsNThnad22K0QhmafkkZs4foKfJnbPp/E0J9FWI7NzSM8MN+1NksPK4RsvT8va1XE7NbKQAHMPQB+24O3NvuXCzrXXRm+MEwIiLWgiIgIiICIiAiIgIiICIiAojV2Yfp/S+VyMQDpq1Z8kTXdxft5IP6dlLqvcQqct7ROZjhaZJm13SsY0blzmeWAB6SW7LbgxE4lMVarwsa0hp/Dx4DDVKEZ5uxZ5cnnkkJ3e8/K5xc4n0kqRWGnaivVILMDueGZjZGO9LSNwf1FZlhVMzVM1a0FUuIHFbS3C6LHv1JkzSfkJHRVIIa01madzW8z+SKFj3kNHUnbYbjchW1cU9krQqPg07k48frBupMc+zJiM5o7HG7NQldG0OZNEA4Ojl6Atc0tPL1LehWI2cp7JjT+N4q6b0m2tetUc3hfC8OTq463ODzyQthaGxwu8lzZHOdISAzZodylwVgtcftBUdct0hZz3tfOvtNotilpzthNhw3bCJzH2XaHcbN59zuBsuUx5fWendd8Ltfax0nlrtuxpGzicxDp6g+4+neklrTDnij3LWu7J43G4aehPnVA4t4/Wep5tTDMYbX+W1Bj9VwW8fUxsEwwsOJguRSRyRtjIjsSGJpJGz5ec9GgDoHpi3x20TT1je0ocpYsahozR17VCnjbVh8DpI2yMLzHE4NYWvb5ZPLuSN9wQIvgLx7xvHPBWblWjdx1yvYsxyV56VlkYjZYkijc2aSJjHuc1gc5jSSwktcAQtbhLp+7jOMXGnJWsbYqQZLLY91W3NA5jbUbMdA0ljiNnta/nb03APMO/dRfsY7GQ0vh8poTMaezWNyWLymUte3rFF7aFmGW9JLG6GxtyPLmzNPKDuOV24GyDuCIiDXyFCvlaFmlbibPVsxuhlif3PY4bOB/KCVEaGvz39Nwi1L29upLNRmlO+8j4ZXRF53/wAXJzfpU+qzw8b2mn5Lg35L921cj5htvHJO90Z2+VnKf0r0U9zVffH7XYsyIi86CIiAiIgIiICIiAiIgIiICIiCqU52aDeaNvaLAOeXU7fXkqbncwynuY3cnkf0btsw7EN7THqvhFobX+RjyWo9JYTP3mxCFlrIUYp5BGCSGhzgTy7ucdvlKtr2NkY5j2h7HDYtcNwR6Cq0/h9joSTjbOQwoP8AdY62+OIejaI7xt/Q0f8AYL0TVRiaa5tPO/8A3/WWiVePsbeFBaG+5vpblBJA8EwbA+f+z8gVm0fw70tw9hsxaY09jNPxWXNdOzG1GQCUjcAuDQN9tz3+lYfEmx8as9+2h+yTxJsfGrPftofsk93h8fpKWjetCKr+JNj41Z79tD9kqnex2Wr8VcHp5mqcx4OuYW/flJlh7TtYZ6bGbfc/e8tiTfp38vUed7vD4/SS0b3VFC6s0XgNd4xuO1HhaGdx7ZBM2rka7Z4w8AgO5XAjcBxG/wApWj4k2PjVnv20P2SeJNj41Z79tD9knu8Pj9JLRvQDfY3cKWBwbw40u0PGzgMTB1G4Ox8n0gfqUnpngroDRmXiyuA0XgcNk4g5sdyjj4oZWhw2cA5rQRuCQVueJNj41Z79tD9kvviBTsO/pDIZXKs337G1deIj+VjOVrh8jgQmZhxrr5R/4Wh+crkPG7t8Nipeeo/mhyGRhd5ELOodFG4d8p7unvBu4kHla6ywQR1oI4YWNiijaGMYwbBrQNgAPMF8q1YaVeOvXhjrwRtDWRRNDWtA7gAOgCyrCuuJjNp1QSIiLUgiIgIiICIiAiIgIiICIiAiIgIiICIiAufZYt937SwJPN4sZfYebb21jd/P+TzfpHn6Cuf5Xf3ftLdW7eLGX6EDf/zWN7vPt+Tp3b+ZB0BERAREQEREBERAREQEREBERAREQEREBERAREQEREBERAXPcsB/tA6VPM0HxXzHk7dT/veM677d36fOP0dCXPctt/tBaV6nm8V8xsOX/i8Z5/8A9/2QdCREQEREBERAREQEREBERAREQEREBERAREQERaeXy1fB46a7aLhDEBuGNLnOJIDWtA7ySQAPOS
FYiaptGsbiKlP1Dquby4cVia7HdRHYuyOkaP8A3cse2/pAJHylfnw7rD8Qwfrc32a9fZa98c4Wy7oqR4d1h+IYP1ub7NPDusPxDB+tzfZp2WvfHOCy7rwHrH2e2V097IivibXCud2ocTHc06MfFmA7t5Z7FZzXsd7X35T7XG2w8oPB8wXsXw7rD8Qwfrc32a5BnvY/zah9kHh+LVjH4YZnHVexNQWJDFPM0csU7j2e/Oxp2H/Kz/D1dlr3xzgs9LIqR4d1h+IYP1ub7NPDusPxDB+tzfZp2WvfHOCy7oqR4d1h+IYP1ub7NPDusPxDB+tzfZp2WvfHOCy7oqUzPaua7d+NwsjR3tbdmaT+nsjt+pWPAZyHP0PbEbHwSMeYpq8u3PDI33zHbdOnpG4IIIJBBWqvArw4zp1eE3LJJERaEEREBERAREQEREBERAREQFUuJh2wVEeY5ahuD85jVtVR4m/eKh+dqH8zGvTk3f0ecMqdcNtERepiIiICKJy2qsXgsthsbesmG7mJn16MXZvd2r2RukcNwCG7Ma47uIHTbv6KRt24KFWazZmjr1oWOklmlcGsY0DcucT0AAG5JUGVFr43I1cxjqt+lPHapWomTwTxO5mSRuAc1zT5wQQR+VbCoItXKZWng8bayORtQ0aFWJ009mw8MjijaN3Oc49AAASSVmrzx2oI5oXiSKRoex7e5zSNwQgyLR0Af6V1kPMMszYAf8DVK3lo6A++2s/zvH/I1VZ7uvy/cMo1SuKIi5bEREQEREBERAREQEREBERAVR4m/eKh+dqH8zGrcqjxN+8VD87UP5mNenJu/o84ZU64bapHGvU1PSPDDOZG7NlIIuSOux2EkbHddLLI2KJsTndGuc97W8x6DffzK7qK1TpbFa10/dwecpR5HFXGdnPWl32eNwR1BBBBAIIIIIBBBC9M6mLzLpStxPZkOJvD+nl7uIzEunamSw5y2ddl5aU0kk0bh7adG1zecRjps4MPVpO6zxRal1Nw/vYHS1rWdfUWAz1eTUun8rqD+k3V3QbmCpf3I5H7tla7mbzbOG7AQF2Cn7HXh9RZkBFgXOfkaRx92aW/ZkltQl7X8skjpC55BY3lc4lzQNmkAkL432OfD5mAfhm4KVtR91uQfK3I2hadYawxtkNjte1JDCWjd/QEha82RzHEanhzWq+BGS03qLU8mOyFrK421WzN6UvkMNS04stRc3LJJHKzbmIJ8huzj0Kr+GqZbFaV17ozXuX1W/XE+mb1508uZfNjclCwnexU5SDAQSxrotmbNdts4EleiMZwk0jhYdMQ0MNHUi00+aTFMhlkaK75Y3xyu995Zc2R+5fzHdxPf1WnovgZofh9dtW8HgmVrFisab3z2JrPLXJ5jCwSvcGRk7Esbs07Dp0VzZHG8fVq6V9jvw0wuOvatv5fVUdD2lXx+oJYZnymkJHsFmQuNes1jHOLY9tthyjqVWqWqtaxcOsjp/IagymPyWN4lY7AMuw5Q27UVSZ9ZzojZdG0zbdu8cz2dRsCDsu8wexv4eVdPHBw4KWPGCyy5FE3JWg6tKwODHQP7Xmg2D3DaMtGziNtlt47gHoLEV5IKWAbWhkyFTKvjjtThr7dYh0M5HP1eCAXE+/I8vmUzZHCeKWPt4vTXsgdFSZ3OZLCUtKVszT9v5OaeeCR7LPaR9s5xe6JxgYSxxLdi4bbOIXonhXputpfQmIq1bmQvRSV45+1yV+W5Ju5jTsHyucQ30NB2HmC3Z9AaftZjN5SfGxz3M1RjxuQdK5z2WKzO05Y3MJ5dvusm+wBPN136L8aD4eYHhnhXYnTtSaljzJ2vYy25rHKeVrdmmV7i1oa1oDQQBt0CyiLSLGtHQH321n+d4/5Gqt5aOgPvtrP87x/yNVbJ7uvy/cMo1SuKIi5bEREQEREBERAREQEREBERAVR4m/eKh+dqH8zGrcorU2D8YcPLTbN7WmD45oZuXm7OWN4ewkbjcczRuNxuNxuN1vwKooxaaqtUTCxoloooZ9/UVfyJdJ2rEg6OfSuVnRH5WmSRjtvytB+RanjPmDfbTbo3LvmLXOcWTVHMZy8m4e8TcrXESNIaSCRuQCGkjoZn1R90dSyyIoTwtnviZlfWqX26eFs98TMr61S+3TM+qPujqtk2ihPC2e+JmV9apfbqr3eMdbH8Qsfoexg78WqshUfdrY4z1eaSFm/M7m7blHc47E7kNJA2BTM+qPujqWdDRQnhbPfEzK+tUvt08LZ74mZX1ql9umZ9UfdHUsm0UJ4Wz3xMyvrVL7dPC2e+JmV9apfbpmfVH3R1LJtaOgPvtrP87x/yNVRGP1RlcpI+GHSmRgsNBJiuWK0TmgPczmLe1Lw0ljtncpDgNwSCFbdKYObC0rDrcrJb92c2rJi37Nry1rQ1m/Xla1jW7nbfbfYb7DXiTFGHVEzGnRomJ2xOzyNUJtERcxiIiICIiAiIgIiICIiAiIgIvjnBjS5xDWgbknuCgY32NT2GyRyTUsRBOfeiNzcpGYuhDtyWxczz3crnOiBB7M/dA/M+Qs6lE1bEyy06ZjhlZnIuykilBk8uOEbkl3I07vLeUdowt5yHBstjcVTw8MkNGrFUikmksPbEwNDpJHl8jzt3uc5xJPnJKzVq0NKtFXrxMggiYI44omhrWNA2DQB0AA6bLKgIiIC/njxB9jLxuz3suqmsq2otK1c/OZszi43XbRigqVJYIhA8iv5xYjBABB3fufT/Q5c/wAhyzcfMByhpdX0zkec7nmaJLVHl6d2x7J3+n8qDoCIiAiIgis3p2vmWPla99DJivJWr5WqyP21Va8tLuzc9rhtzMjcWuBa4sbzNcBstV+opcRekhzcUNKpLahq0L0cjntsukb0bIOUdi/nBYASWu5o9ncz+Rs+iAirIqy6Jqh1NktrT9WCxNNWHbWrjHc3aNEI3c57QC9oiAJADGsGwDVYoJ47MLJoniSJ7Q5rm9xB7igyIiICIiAiIgIiICIiAiLFan9q1ppuR8vZsL+SMbudsN9gPOUEBZEOsr1zHu5J8JUdJTyVK5j+eO690bHBjXv8l0bQ883K1wL9m8wMcjDZFA6Dj5NF4R3a5SYyVI5i/Nn/AH3d7Q4iYDoHjm2LR0BGw6AKeQEREBERAXPuHBOq9Q6g1xvzUciIsdiHb7h9GAvInHXbaWWWZwI99G2E+jb96ltS8QsrY0pjJnR4iu8Mz+Qhc5ruXYO9pROHdI8Edo4Hdkbths+RrmXqvXiqQRwQRshhiaGMjjaGtY0DYAAdwA8yDIiIgIiICIiAoG7RfgbdrK0Ws7CeT2xkoXNlke8Nj5eeJrOby+VrByhp5+UDoepnkQa2OyNXMY+rfo2I7dK1E2eCxC4OZLG4BzXNI6EEEEH5Vsqv4WWSjqTMYuR+UtMcGZGGzbiBrxtlLmmvFKO8sdEXlrurRMzYkbBtgQEREBERAREQERQuY1tp7T9oVsnnMdj7JHN2Nm0xj9vTyk77LOmiqubUxeVtdNIqt7qWjvjTiPXY/pVZ4l3+G3FfQmZ0ln9R4qbFZ
SDsZQy/G17SCHMe07++a9rXDfpu0bgjotvZ8bgnlK5s7kjoXiBpeGWpow6k31NSdLSGKzuQidmJxCXDtnx83O8PjYJWv28qNzXnvKvy/nF7CngvR4K+yJ1ff1Hm8XJj8PTNbE5T2ywRXDM4fdIzvtuI2uDh3tL9j8vvT3UtHfGnEeux/SnZ8bgnlJmzuWlFVvdS0d8acR67H9Ke6lo7404j12P6U7PjcE8pM2dy0qm57O5DUGXk05puXsJIi0ZXM8vM3HsI37KLccr7Lm9zTuImuEjwd445ojJcRqus86zS+ls5UgfLHz28vFPG50LCPeVmu3Esx9OxZGOrtzysdesHg6Gm8XDjsbWbVpw8xbG0kkuc4ue9zjuXOc5znOc4lznOJJJJK1VUVUTauLJaz5gcDQ0xiK2MxlcVqVcEMZzFxJJLnOc5xLnvc4lznuJc5ziSSSSpBEWCCIiAiIgIiICIiCu2yG8Q8UN8yS/F3OkX3tHLNW/8b0Tnm+5+lgn9CsS45k/ZFcKq/EbFQy8T8LE9mNvtfEzO1Bjw4TVBtP8AdOk469mP8Ptj0LsaAiIgIiICIiDSzVx2Pw960wAvggklaD6WtJH/AMKo6SqR1sBSkA5p7MTJ55ndXzSOaC57iepJJ/R3dwVn1V+DGY+ZzfwFV7TX4OYr5pF/AF0MDRhT5rsSSIizQREQEREGrksbWy1OStajEkT/AJdi0jqHNI6tcDsQ4dQQCOq39B5SfNaLwd60/tbM9OJ8sm23O7lG7tvNueu3yrEsPCz+rnTnzGL+FY4unBnwmPxPRdi0oiLnIIiICIq3rrWcGisQLDoxZuTv7KrV5uXtX95JPma0bkn0DYbkgHZh4dWLXFFEXmRM5PLUcJUdbyNyvQqt99PalbGwflc4gKsS8YdHQvLTnIXEdN445Hj9YaQuH5O1azuR8IZWw6/e68skg8mIb+9jb3Mb0HQdTsCST1WNfW4XsPDin5tc38P7cvDuPuzaN+Gm+ry/UT3ZtG/DTfV5fqLhyLd8Dybiq5x0Lw4FxI9jppPVPsxsdqSvcjPD3JSeGMq4RSBsdhh3fBy7c33V/Keg2Ae70L3d7s2jfhpvq8v1Fw5E+B5NxVc46F4dx92bRvw031eX6i+s4yaNe7bw3G35XwyNH6y1cNRPgeTcVXOOheHpbD6gxmoa7p8XkKuQiaeVzq0rZA0+g7HofkKkF5YgMlK9HepTyUb8fvLVchr2/IehDh0HkuBB26gruvDfXw1jSmr22sgy9MNE8bPeytPdKweZpIII72kEdRsTxcu9l1ZLT7yib0+sLr1LkiIuEiL1V+DGY+ZzfwFV7TX4OYr5pF/AFYdVfgxmPmc38BVe01+DmK+aRfwBdHB7mfP9Lsb1h0jIJHQsbLMGksY53KHO26AnY7dfPsV524W8etUYzgrmNZ68xUVivUvW4Ks2Puiazdn8ISV46wh7GNrNnckbXcx5gOYhvVejV57h4Baul0DqXQU+RwsWAdfmy+By0Jldchsm8LkTZ4i0M5WvLmkteSRt0Ck32IsDfZCT6WtZmpxD0wdIWqGFlz8XtXINyEdmtE4Nla14YzaVrnMHJtsecbOIWCvxvzs9iriNT6Om0dNqDF27WEsx5Ntpz3xQ9q6KUNY0wyhh5wAXDyXeVuFG5ngRqji5kM3e4i3MNRdPp2xp+hU086WaOHt3NdJZe+VrCXbxx7MA2AB3J71u47hRrrV+qtNZHX9/BMqaap2oajMCZnvuWJ4DXdPL2jWiMCMv2Y3m6vPldAp/yEHpLjjmNNcMOC2MixbtV6o1XhGTNnyuWFRkj4oInSc072vL5XmQbN2Jds4kjZehMfNPZoVprNY07MkTXy1y8P7J5AJZzDodjuNx0Oy8/WOC2vncEMDw9sUdC6ir4+pJjpJMr7ZaOzY1rKtiPlY4smaA4uA8+3K8Ltmg9P29KaJwGFv5KTMXsdQgqT5CbfnsvZGGukO5J3cQT1JPXqSrTfaJ1YeFn9XOnPmMX8KzLDws/q5058xi/hVxe5nzj8SuxaURFzkEREBcC4s5J2S4iWIHOJixtWOCNp7muk+6PI/KOyB/5Au+rgXFnGuxnEOedzSIsnVjnjee5z4/ubwPyDsj/nC73sXN7Vp12m3p+rrslVkWvkb8WLoz25xKYYWF7xDC+V+w9DGAucfkAJVVHFvT5/us5/07kPsF9vViUUaKpiGtcnODWkkgAdST5lxOl7KDD3chUeyDHnCW7bKkU7M1A695T+RsjqY8sMLiD74uDTuWhXtnFHT997avY5o9uez2fp++xp36dXGAADr3k7KvcPtCau0HFj9Ptfp+9pmhI5sV6Zsovur7ktYWAcnMNwOfm7h73deTErrrqp9zVo22tO637Vin43X68OUyUmli3T2LzMmHuX/CDe0aW2BCJWRcnlN3c0kFzSNyBzAbnX4mcUMxNh9c0dL4Sa5BhaM8V3NNvisas5gL9oRsS98bXNcdi3Y9Ad1nyPCbL2+HWsMAyzSFzMZ2bJ13ue/s2xPtsmAeeTcO5WkbAEb+fzrBqHhprCv484/TlnCyYTVQmmkGTdMyarYlgEUhbyNIe13K09dtj6fPoqnKM2030x4X2/wdH0XPLa0dgpppHzTSUIHvkkcXOc4xtJJJ7yT51MKi4/W+K0bjKGDvtykl3H1oa0zqeFvTxFzY2glsjIS1w+UFZ/dd08f7rO/9O5D7Be2nFw4iImqL+aLmpbRWSdh9e4CyxxaJpzSlA/tslaQB/rEbv8qreFzVbP46O7UFhsDyQBarS15Oh2O7JGtcO7zjqrJonGuzOvcBWY3mbBObspH9hkbSQf8AWYx/mUyiaJwK5q1Wn8Mqdb0giIvzBUXqr8GMx8zm/gKr2mvwcxXzSL+AK05mm7I4i9UYQHzwSRAnzFzSP/tVDSVyOxgacIPJZrQsgsQO6Phka0BzHA9QQf1jYjoQuhgacKY8V2JhERZoIiICIiAsPCz+rnTnzGL+FY8nlK2IqPs2pRHG3oB3ue49A1rR1c4kgBo3JJAHUqQ0Ji58JozCUbTOzswU4mSx778j+Ubt38+x6b/IscXRgz4zH4nquxOoiLnIIiICrmudGQa1w4rPkFa3C/tatrl5jE/u6jpu0jcEb9x6EEAixotmHiVYVcV0TaYHl3K1LWn8h7Qy1c4+515WvO7JR/ijf3PHd3dRuNw09FjXpzJYulmaj6t+pBerP99DZibIw/laQQqxLwg0dK4uOBrtJ67RuewfqBAX1uF7cw5p+bRN/D+locKRdy9xvRvwHF+1k+snuN6N+A4v2sn1lu+OZNw1co6locNRdy9xvRvwHF+1k+snuN6N+A4v2sn1k+OZNw1co6locNRdy9xvRvwHF+1k+svrODujWO38BQO+R73uH6i7ZPjmTcNXKOpaN7hdYS5C8yjRgkv33+9q1wHPPynrs0dR5TiAN+pXduHGgho2jNPaeyfL2+UzyM95G0e9iYe8tBJO56uJJ2A2a2xYjBY3AVzBjKFbHwk7llaJsYcfSdh1Pylb64mXe1Ksrp93RFqf
WV1ahERcNBQuY0Vp/UNgWMpg8bkZwOUS2qkcjwPRu4E7KaRZU11UTembSalW9yvRnxTwn7vi+qnuV6M+KeE/d8X1VaUW7tGNxzzlbzvVb3K9GfFPCfu+L6qe5Xoz4p4T93xfVVpRO0Y3HPOS871W9yvRnxTwn7vi+qnuV6M+KeE/d8X1VaUTtGNxzzkvO9B4rQ2nMFZbZx2AxlCw3flmrVI43t379iBuN1OIi1VV1VzeqbprERFgCIiAiIgIiICIiAiIgIiICIiAiIg//9k=", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from langgraph.graph import START, StateGraph\n", + "from IPython.display import Image, display\n", + "\n", + "# Define a new graph for the agent\n", + "builder = StateGraph(GraphState)\n", + "\n", + "# Define the two nodes we will cycle between\n", + "builder.add_node(\"assistant\", assistant)\n", + "builder.add_node(\"tools\", tool_node)\n", + "\n", + "# Set the entrypoint as `agent`\n", + "builder.add_edge(START, \"assistant\")\n", + "\n", + "# Making a conditional edge\n", + "# should_continue will determine which node is called next.\n", + "builder.add_conditional_edges(\"assistant\", should_continue, [\"tools\", END])\n", + "\n", + "# Making a normal edge from `tools` to `agent`.\n", + "# The `agent` node will be called after the `tool`.\n", + "builder.add_edge(\"tools\", \"assistant\")\n", + "\n", + "# Compile and display the graph for a visual overview\n", + "react_graph = builder.compile()\n", + "display(Image(react_graph.get_graph(xray=True).draw_mermaid_png()))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wlNB4fI4ZQv5" + }, + "source": [ + "To test our setup, we will run the agent with a query. The agent will fetch the price of copper using the metals.dev API." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "rzt0I-n2ZQv5" + }, + "outputs": [], + "source": [ + "from langchain_core.messages import HumanMessage\n", + "\n", + "messages = [HumanMessage(content=\"What is the price of copper?\")]\n", + "result = react_graph.invoke({\"messages\": messages})" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "esoHsop8ZQv5", + "outputId": "0d52f2db-f2da-4f5a-943e-e549b731f01e" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[HumanMessage(content='What is the price of copper?', id='4122f5d4-e298-49e8-a0e0-c98adda78c6c'),\n", + " AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_DkVQBK4UMgiXrpguUS2qC4mA', 'function': {'arguments': '{\"metal_name\":\"copper\"}', 'name': 'get_metal_price'}, 'type': 'function'}]}, response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 116, 'total_tokens': 134, 'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0}, 'completion_tokens_details': {'reasoning_tokens': 0, 'audio_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0ba0d124f1', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-0f77b156-e43e-4c1e-bd3a-307333eefb68-0', tool_calls=[{'name': 'get_metal_price', 'args': {'metal_name': 'copper'}, 'id': 'call_DkVQBK4UMgiXrpguUS2qC4mA', 'type': 'tool_call'}], usage_metadata={'input_tokens': 116, 'output_tokens': 18, 'total_tokens': 134}),\n", + " ToolMessage(content='0.0098', name='get_metal_price', id='422c089a-6b76-4e48-952f-8925c3700ae3', tool_call_id='call_DkVQBK4UMgiXrpguUS2qC4mA'),\n", + " AIMessage(content='The price of copper is $0.0098 per gram.', response_metadata={'token_usage': {'completion_tokens': 14, 'prompt_tokens': 148, 'total_tokens': 162, 
'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0}, 'completion_tokens_details': {'reasoning_tokens': 0, 'audio_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0ba0d124f1', 'finish_reason': 'stop', 'logprobs': None}, id='run-67cbf98b-4fa6-431e-9ce4-58697a76c36e-0', usage_metadata={'input_tokens': 148, 'output_tokens': 14, 'total_tokens': 162})]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result[\"messages\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wsK_VEDSZQv6" + }, + "source": [ + "### Converting Messages to Ragas Evaluation Format\n", + "\n", + "In the current implementation, the GraphState stores messages exchanged between the human user, the AI (LLM's responses), and any external tools (APIs or services the AI uses) in a list. Each message is an object in LangChain's format\n", + "\n", + "```python\n", + "# Implementation of Graph State\n", + "class GraphState(TypedDict):\n", + " messages: Annotated[list[AnyMessage], add_messages]\n", + "```\n", + "\n", + "Each time a message is exchanged during agent execution, it gets added to the messages list in the GraphState. However, Ragas requires a specific message format for evaluating interactions.\n", + "\n", + "Ragas uses its own format to evaluate agent interactions. So, if you're using LangGraph, you will need to convert the LangChain message objects into Ragas message objects. This allows you to evaluate your AI agents with Ragasโ€™ built-in evaluation tools.\n", + "\n", + "**Goal:** Convert the list of LangChain messages (e.g., HumanMessage, AIMessage, and ToolMessage) into the format expected by Ragas, so the evaluation framework can understand and process them properly." 
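For intuition, the target format is simply a list of Ragas' own message types. Building the copper trace by hand would look roughly like the sketch below; in practice the converter shown next handles metadata and tool calls for you.

```python
from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage

# Hand-built equivalent of the copper conversation in Ragas' message format.
manual_trace = [
    HumanMessage(content="What is the price of copper?"),
    AIMessage(
        content="",
        tool_calls=[ToolCall(name="get_metal_price", args={"metal_name": "copper"})],
    ),
    ToolMessage(content="0.0098"),
    AIMessage(content="The price of copper is $0.0098 per gram."),
]
```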
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To convert a list of LangChain messages into a format suitable for Ragas evaluation, Ragas provides the function [convert_to_ragas_messages][ragas.integrations.langgraph.convert_to_ragas_messages], which can be used to transform LangChain messages into the format expected by Ragas.\n", + "\n", + "Here's how you can use the function:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from ragas.integrations.langgraph import convert_to_ragas_messages\n", + "\n", + "# Assuming 'result[\"messages\"]' contains the list of LangChain messages\n", + "ragas_trace = convert_to_ragas_messages(result[\"messages\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[HumanMessage(content='What is the price of copper?', metadata=None, type='human'),\n", + " AIMessage(content='', metadata=None, type='ai', tool_calls=[ToolCall(name='get_metal_price', args={'metal_name': 'copper'})]),\n", + " ToolMessage(content='0.0098', metadata=None, type='tool'),\n", + " AIMessage(content='The price of copper is $0.0098 per gram.', metadata=None, type='ai', tool_calls=None)]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ragas_trace # List of Ragas messages" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "n5mbTp5aZQv6" + }, + "source": [ + "## Evaluating the Agent's Performance" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "H885v5sxZQv6" + }, + "source": [ + "For this tutorial, let us evaluate the Agent with the following metrics:\n", + "\n", + "- [Tool call Accuracy](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/agents/#tool-call-accuracy):ToolCallAccuracy is a metric that can be used to evaluate the performance of the LLM in identifying and calling the required tools to complete a given task. \n", + "\n", + "- [Agent Goal accuracy](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/agents/#agent-goal-accuracy): Agent goal accuracy is a metric that can be used to evaluate the performance of the LLM in identifying and achieving the goals of the user. This is a binary metric, with 1 indicating that the AI has achieved the goal and 0 indicating that the AI has not achieved the goal.\n", + "\n", + "\n", + "First, let us actually run our Agent with a couple of queries, and make sure we have the ground truth labels for these queries." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7kRRIyTAZQv6" + }, + "source": [ + "### Tool Call Accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "CC973Yq1ZQv6", + "outputId": "d5bf508d-f3ba-4f2e-a4c6-e6efbf229603" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "18wmDI0xZQwB" - }, - "source": [ - "## Whatโ€™s next\n", - "๐ŸŽ‰ Congratulations! We have learned how to evaluate an agent using the Ragas evaluation framework." 
+ "data": { + "text/plain": [ + "1.0" ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { + ], + "source": [ + "from ragas.metrics import ToolCallAccuracy\n", + "from ragas.dataset_schema import MultiTurnSample\n", + "from ragas.integrations.langgraph import convert_to_ragas_messages\n", + "import ragas.messages as r\n", + "\n", + "\n", + "ragas_trace = convert_to_ragas_messages(\n", + " messages=result[\"messages\"]\n", + ") # List of Ragas messages converted using the Ragas function\n", + "\n", + "sample = MultiTurnSample(\n", + " user_input=ragas_trace,\n", + " reference_tool_calls=[\n", + " r.ToolCall(name=\"get_metal_price\", args={\"metal_name\": \"copper\"})\n", + " ],\n", + ")\n", + "\n", + "tool_accuracy_scorer = ToolCallAccuracy()\n", + "tool_accuracy_scorer.llm = ChatOpenAI(model=\"gpt-4o-mini\")\n", + "await tool_accuracy_scorer.multi_turn_ascore(sample)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Tool Call Accuracy: 1, because the LLM correctly identified and used the necessary tool (get_metal_price) with the correct parameters (i.e., metal name as \"copper\")." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rGOL1CBsZQv6" + }, + "source": [ + "### Agent Goal Accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "id": "FA0kMvTfZQwB" + }, + "outputs": [], + "source": [ + "messages = [HumanMessage(content=\"What is the price of 10 grams of silver?\")]\n", + "\n", + "result = react_graph.invoke({\"messages\": messages})" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { "colab": { - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "ragas", - "language": "python", - "name": "python3" + "base_uri": "https://localhost:8080/" + }, + "id": "YJr4Hxn8ZQwB", + "outputId": "9797c93b-47a2-4264-b535-f182effb396b" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[HumanMessage(content='What is the price of 10 grams of silver?', id='51a469de-5b7c-4d01-ab71-f8db64c8da49'),\n", + " AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_rdplOo95CRwo3mZcPu4dmNxG', 'function': {'arguments': '{\"metal_name\":\"silver\"}', 'name': 'get_metal_price'}, 'type': 'function'}]}, response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 120, 'total_tokens': 137, 'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0}, 'completion_tokens_details': {'reasoning_tokens': 0, 'audio_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0ba0d124f1', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-3bb60e27-1275-41f1-a46e-03f77984c9d8-0', tool_calls=[{'name': 'get_metal_price', 'args': {'metal_name': 'silver'}, 'id': 'call_rdplOo95CRwo3mZcPu4dmNxG', 'type': 'tool_call'}], usage_metadata={'input_tokens': 120, 'output_tokens': 17, 'total_tokens': 137}),\n", + " ToolMessage(content='1.0523', name='get_metal_price', id='0b5f9260-df26-4164-b042-6df2e869adfb', tool_call_id='call_rdplOo95CRwo3mZcPu4dmNxG'),\n", + " AIMessage(content='The current price of silver is approximately $1.0523 per gram. 
Therefore, the price of 10 grams of silver would be about $10.52.', response_metadata={'token_usage': {'completion_tokens': 34, 'prompt_tokens': 151, 'total_tokens': 185, 'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0}, 'completion_tokens_details': {'reasoning_tokens': 0, 'audio_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0ba0d124f1', 'finish_reason': 'stop', 'logprobs': None}, id='run-93e38f71-cc9d-41d6-812a-bfad9f9231b2-0', usage_metadata={'input_tokens': 151, 'output_tokens': 34, 'total_tokens': 185})]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result[\"messages\"] # List of Langchain messages" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "StDNqR2vZQwB", + "outputId": "47e914a4-3e48-4932-8b20-752441b42fd4" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[HumanMessage(content='What is the price of 10 grams of silver?', metadata=None, type='human'),\n", + " AIMessage(content='', metadata=None, type='ai', tool_calls=[ToolCall(name='get_metal_price', args={'metal_name': 'silver'})]),\n", + " ToolMessage(content='1.0523', metadata=None, type='tool'),\n", + " AIMessage(content='The current price of silver is approximately $1.0523 per gram. Therefore, the price of 10 grams of silver would be about $10.52.', metadata=None, type='ai', tool_calls=None)]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from ragas.integrations.langgraph import convert_to_ragas_messages\n", + "\n", + "ragas_trace = convert_to_ragas_messages(\n", + " result[\"messages\"]\n", + ") # List of Ragas messages converted using the Ragas function\n", + "ragas_trace" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" + "id": "c6u9-RYdZQwB", + "outputId": "ebf8fdd8-88fc-47c3-e1e2-b401956c0633" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "from ragas.dataset_schema import MultiTurnSample\n", + "from ragas.metrics import AgentGoalAccuracyWithReference\n", + "from ragas.llms import LangchainLLMWrapper\n", + "\n", + "\n", + "sample = MultiTurnSample(\n", + " user_input=ragas_trace,\n", + " reference=\"Price of 10 grams of silver\",\n", + ")\n", + "\n", + "scorer = AgentGoalAccuracyWithReference()\n", + "\n", + "evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o-mini\"))\n", + "scorer.llm = evaluator_llm\n", + "await scorer.multi_turn_ascore(sample)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Agent Goal Accuracy: 1, because the LLM correctly achieved the userโ€™s goal of retrieving the price of 10 grams of silver." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "18wmDI0xZQwB" + }, + "source": [ + "## Whatโ€™s next\n", + "๐ŸŽ‰ Congratulations! We have learned how to evaluate an agent using the Ragas evaluation framework." 
+ ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "ragas", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/src/ragas/metrics/_noise_sensitivity.py b/src/ragas/metrics/_noise_sensitivity.py index 4074d0edc..b1dcef58a 100644 --- a/src/ragas/metrics/_noise_sensitivity.py +++ b/src/ragas/metrics/_noise_sensitivity.py @@ -98,10 +98,7 @@ async def _decompose_answer_into_statements( assert self.sentence_segmenter is not None, "sentence_segmenter is not set" sentences = self.sentence_segmenter.segment(text) - sentences_with_index = { - i: sentence - for i, sentence in enumerate(sentences) - } + sentences_with_index = {i: sentence for i, sentence in enumerate(sentences)} statements_simplified = await self.statement_prompt.generate( llm=self.llm, diff --git a/src/ragas/testset/synthesizers/generate.py b/src/ragas/testset/synthesizers/generate.py index c006e6c6a..49841a083 100644 --- a/src/ragas/testset/synthesizers/generate.py +++ b/src/ragas/testset/synthesizers/generate.py @@ -10,10 +10,7 @@ from ragas._analytics import TestsetGenerationEvent, track from ragas.callbacks import new_group from ragas.cost import TokenUsageParser -from ragas.embeddings.base import ( - BaseRagasEmbeddings, - LlamaIndexEmbeddingsWrapper, -) +from ragas.embeddings.base import BaseRagasEmbeddings, LlamaIndexEmbeddingsWrapper from ragas.executor import Executor from ragas.llms import BaseRagasLLM, LangchainLLMWrapper, LlamaIndexLLMWrapper from ragas.run_config import RunConfig diff --git a/src/ragas/testset/transforms/__init__.py b/src/ragas/testset/transforms/__init__.py index ccfe73b2c..f8e22040b 100644 --- a/src/ragas/testset/transforms/__init__.py +++ b/src/ragas/testset/transforms/__init__.py @@ -1,4 +1,10 @@ -from .base import BaseGraphTransformation, Extractor, RelationshipBuilder, Splitter, NodeFilter +from .base import ( + BaseGraphTransformation, + Extractor, + NodeFilter, + RelationshipBuilder, + Splitter, +) from .default import default_transforms from .engine import Parallel, Transforms, apply_transforms, rollback_transforms from .extractors import ( @@ -8,12 +14,12 @@ SummaryExtractor, TitleExtractor, ) +from .filters import CustomNodeFilter from .relationship_builders.cosine import ( CosineSimilarityBuilder, SummaryCosineSimilarityBuilder, ) from .splitters import HeadlineSplitter -from .filters import CustomNodeFilter __all__ = [ # base diff --git a/src/ragas/testset/transforms/default.py b/src/ragas/testset/transforms/default.py index 071c42756..eacfe57ea 100644 --- a/src/ragas/testset/transforms/default.py +++ b/src/ragas/testset/transforms/default.py @@ -83,7 +83,9 @@ def summary_filter(node): threshold=0.01, filter_nodes=lambda node: node.type == NodeType.CHUNK ) - node_filter = CustomNodeFilter(llm=llm, filter_nodes=lambda node: node.type == NodeType.CHUNK) + node_filter = CustomNodeFilter( + llm=llm, filter_nodes=lambda node: node.type == NodeType.CHUNK + ) transforms = [ headline_extractor, diff --git a/src/ragas/testset/transforms/extractors/llm_based.py b/src/ragas/testset/transforms/extractors/llm_based.py index dca9c66f4..4604775a5 100644 --- 
a/src/ragas/testset/transforms/extractors/llm_based.py +++ b/src/ragas/testset/transforms/extractors/llm_based.py @@ -282,7 +282,9 @@ class TopicDescription(BaseModel): class TopicDescriptionPrompt(PydanticPrompt[StringIO, TopicDescription]): - instruction: str = "Provide a concise description of the main topic(s) discussed in the following text." + instruction: str = ( + "Provide a concise description of the main topic(s) discussed in the following text." + ) input_model: t.Type[StringIO] = StringIO output_model: t.Type[TopicDescription] = TopicDescription examples: t.List[t.Tuple[StringIO, TopicDescription]] = [ diff --git a/src/ragas/utils.py b/src/ragas/utils.py index 33184fb90..f39d92e8b 100644 --- a/src/ragas/utils.py +++ b/src/ragas/utils.py @@ -233,6 +233,7 @@ def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> i num_tokens = len(encoding.encode(string)) return num_tokens + def batched(iterable: t.Iterable, n: int) -> t.Iterator[t.Tuple]: """Batch data from the iterable into tuples of length n. The last batch may be shorter than n.""" # batched('ABCDEFG', 3) โ†’ ABC DEF G From 7c76915cff2f5a307fe69bd823ad6b9196f8fdd7 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Wed, 13 Nov 2024 15:38:29 +0530 Subject: [PATCH 06/13] feat: add more examples --- src/ragas/metrics/_aspect_critic.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/ragas/metrics/_aspect_critic.py b/src/ragas/metrics/_aspect_critic.py index a8332a728..f6a4b3647 100644 --- a/src/ragas/metrics/_aspect_critic.py +++ b/src/ragas/metrics/_aspect_critic.py @@ -70,6 +70,18 @@ class SingleTurnAspectCriticPrompt( verdict=1, ), ), + ( + AspectCriticInput( + user_input="Who was the director of Los Alamos Laboratory?", + response="Einstein was the director of Los Alamos Laboratory.", + reference="J. Robert Oppenheimer was the director of Los Alamos Laboratory.", + criteria="Is the output written in perfect grammar", + ), + AspectCriticOutput( + reason="The criteria for evaluation is whether the output is written in perfect grammar. 
In this case, the output is grammatically incorrect.", + verdict=0, + ), + ), ] From 42f390c32fa7ba0f2a7abeb623df98d89cb4b74a Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 14 Nov 2024 17:21:51 +0530 Subject: [PATCH 07/13] feat: improve repr for llm and embeddings --- src/ragas/embeddings/base.py | 6 ++++++ src/ragas/llms/base.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/ragas/embeddings/base.py b/src/ragas/embeddings/base.py index 9fbafb7cc..9981058cf 100644 --- a/src/ragas/embeddings/base.py +++ b/src/ragas/embeddings/base.py @@ -119,6 +119,9 @@ def set_run_config(self, run_config: RunConfig): self.embeddings.request_timeout = run_config.timeout self.run_config.exception_types = RateLimitError + def __repr__(self) -> str: + return f"{self.__class__.__name__}(embeddings={self.embeddings.__class__.__name__}(...))" + @dataclass class HuggingfaceEmbeddings(BaseRagasEmbeddings): @@ -299,6 +302,9 @@ async def aembed_query(self, text: str) -> t.List[float]: async def aembed_documents(self, texts: t.List[str]) -> t.List[t.List[float]]: return await self.embeddings.aget_text_embedding_batch(texts) + def __repr__(self) -> str: + return f"{self.__class__.__name__}(embeddings={self.embeddings.__class__.__name__}(...))" + def embedding_factory( model: str = "text-embedding-ada-002", run_config: t.Optional[RunConfig] = None diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index d34b9b795..9594d4344 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -45,8 +45,8 @@ def is_multiple_completion_supported(llm: BaseLanguageModel) -> bool: @dataclass class BaseRagasLLM(ABC): - run_config: RunConfig = field(default_factory=RunConfig) - multiple_completion_supported: bool = False + run_config: RunConfig = field(default_factory=RunConfig, repr=False) + multiple_completion_supported: bool = field(default=False, repr=False) def set_run_config(self, run_config: RunConfig): self.run_config = run_config @@ -256,6 +256,9 @@ def set_run_config(self, run_config: RunConfig): self.langchain_llm.request_timeout = run_config.timeout self.run_config.exception_types = RateLimitError + def __repr__(self) -> str: + return f"{self.__class__.__name__}(langchain_llm={self.langchain_llm.__class__.__name__}(...))" + class LlamaIndexLLMWrapper(BaseRagasLLM): """ @@ -336,6 +339,9 @@ async def agenerate_text( return LLMResult(generations=[[Generation(text=li_response.text)]]) + def __repr__(self) -> str: + return f"{self.__class__.__name__}(llm={self.llm.__class__.__name__}(...))" + def llm_factory( model: str = "gpt-4o-mini", From 34945ccedabd1d7f42aef0fde4c55e6cbeb88a05 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 14 Nov 2024 17:35:30 +0530 Subject: [PATCH 08/13] feat: changed required_column setter --- src/ragas/metrics/__init__.py | 16 ++++++++++++++++ src/ragas/metrics/base.py | 17 ++++++++++------- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index 761023fb3..8355772ca 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -58,8 +58,24 @@ from ragas.metrics._summarization import SummarizationScore, summarization_score from ragas.metrics._tool_call_accuracy import ToolCallAccuracy from ragas.metrics._topic_adherence import TopicAdherenceScore +from ragas.metrics.base import ( + Metric, + MetricType, + MetricWithEmbeddings, + MetricWithLLM, + MultiTurnMetric, + SingleTurnMetric, +) __all__ = [ + # basic metrics primitives + "Metric", + 
"MetricType", + "MetricWithEmbeddings", + "MetricWithLLM", + "SingleTurnMetric", + "MultiTurnMetric", + # specific metrics "AnswerCorrectness", "answer_correctness", "Faithfulness", diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index 50c173105..978954ec7 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -83,13 +83,16 @@ def required_columns(self) -> t.Dict[str, t.Set[str]]: return required_columns @required_columns.setter - def required_columns(self, metric_type: MetricType, columns: t.Set[str]): - for column in columns: - if column not in VALID_COLUMNS: - raise ValueError( - f"Invalid column '{column}'. Must be one of {VALID_COLUMNS}" - ) - self._required_columns[metric_type] = columns + def required_columns(self, required_columns: t.Dict[MetricType, t.Set[str]]): + rc = {} + for metric_type, columns in required_columns.items(): + for column in columns: + if column not in VALID_COLUMNS: + raise ValueError( + f"Invalid column '{column}'. Must be one of {VALID_COLUMNS}" + ) + rc[metric_type] = columns + self._required_columns = rc def get_required_columns( self, with_optional: bool = False From 2ee463f1b993a8e614412954383933b9ff17c78e Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 14 Nov 2024 22:15:41 +0530 Subject: [PATCH 09/13] feat: convert simple_criteria and aspect_critic to requirements --- src/ragas/metrics/_aspect_critic.py | 77 ++++++++++++++------------- src/ragas/metrics/_simple_criteria.py | 67 ++++++++++++----------- 2 files changed, 75 insertions(+), 69 deletions(-) diff --git a/src/ragas/metrics/_aspect_critic.py b/src/ragas/metrics/_aspect_critic.py index f6a4b3647..61d9da9f9 100644 --- a/src/ragas/metrics/_aspect_critic.py +++ b/src/ragas/metrics/_aspect_critic.py @@ -19,6 +19,7 @@ if t.TYPE_CHECKING: from langchain_core.callbacks.base import Callbacks + from ragas.llms import BaseRagasLLM logger = logging.getLogger(__name__) @@ -105,7 +106,6 @@ class MultiTurnAspectCriticPrompt( ] -@dataclass class AspectCritic(MetricWithLLM, SingleTurnMetric, MultiTurnMetric): """ Judges the submission to give binary results using the criteria specified @@ -123,48 +123,51 @@ class AspectCritic(MetricWithLLM, SingleTurnMetric, MultiTurnMetric): made using majority vote. 
""" - name: str = field(default="", repr=True) - _required_columns: t.Dict[MetricType, t.Set[str]] = field( - default_factory=lambda: { - MetricType.SINGLE_TURN: { - "user_input:optional", - "response:optional", - "retrieved_contexts:optional", - "reference:optional", - "reference_contexts:optional", - }, - MetricType.MULTI_TURN: { - "user_input", - }, - } - ) - single_turn_prompt: PydanticPrompt = field( - default_factory=lambda: SingleTurnAspectCriticPrompt() - ) - multi_turn_prompt: PydanticPrompt = field( - default_factory=lambda: MultiTurnAspectCriticPrompt() - ) - definition: str = field( - default="check if the response to the user input is correct", repr=True - ) - strictness: int = field(default=1, repr=False) - max_retries: int = 1 - - def __post_init__(self): - if self.name == "": - raise ValueError( - f"{self.__class__.__name__}.__init__() missing required keyword argument: `name`" - ) - if self.definition == "": - raise ValueError( - f"{self.__class__.__name__}.__init__() missing required keyword argument: `definition`" - ) + def __init__( + self, + name: str, + definition: str, + llm: t.Optional[BaseRagasLLM] = None, + required_columns: t.Optional[t.Dict[MetricType, t.Set[str]]] = None, + single_turn_prompt: t.Optional[PydanticPrompt] = None, + multi_turn_prompt: t.Optional[PydanticPrompt] = None, + strictness: int = 1, + max_retries: int = 1, + ): + if required_columns is None: + required_columns = { + MetricType.SINGLE_TURN: { + "user_input:optional", + "response:optional", + "retrieved_contexts:optional", + "reference:optional", + "reference_contexts:optional", + }, + MetricType.MULTI_TURN: { + "user_input", + }, + } + + super().__init__( + name=name, + _required_columns=required_columns, + llm=llm, + ) + + self.definition = definition + self.single_turn_prompt = single_turn_prompt or SingleTurnAspectCriticPrompt() + self.multi_turn_prompt = multi_turn_prompt or MultiTurnAspectCriticPrompt() + self.max_retries = max_retries + self.strictness = strictness # ensure odd number of checks to avoid tie in majority vote. self.strictness = ( self.strictness if self.strictness % 2 != 0 else self.strictness + 1 ) + def __repr__(self) -> str: + return f"{self.name}(definition='{self.definition}', required_columns={self.required_columns}, llm={self.llm})" + def _compute_score( self, safe_loaded_responses: t.List[AspectCriticOutput] ) -> float: diff --git a/src/ragas/metrics/_simple_criteria.py b/src/ragas/metrics/_simple_criteria.py index 968e694e6..f615d6a1a 100644 --- a/src/ragas/metrics/_simple_criteria.py +++ b/src/ragas/metrics/_simple_criteria.py @@ -3,7 +3,6 @@ import logging import typing as t from collections import Counter -from dataclasses import dataclass, field from pydantic import BaseModel, Field @@ -19,6 +18,8 @@ if t.TYPE_CHECKING: from langchain_core.callbacks.base import Callbacks + from ragas.llms import BaseRagasLLM + logger = logging.getLogger(__name__) @@ -121,7 +122,6 @@ class MultiTurnSimpleCriteriaPrompt( ] -@dataclass class SimpleCriteriaScore(MetricWithLLM, SingleTurnMetric, MultiTurnMetric): """ Judges the submission to give binary results using the criteria specified @@ -138,38 +138,41 @@ class SimpleCriteriaScore(MetricWithLLM, SingleTurnMetric, MultiTurnMetric): made using majority vote. 
""" - name: str = field(default="", repr=True) - _required_columns: t.Dict[MetricType, t.Set[str]] = field( - default_factory=lambda: { - MetricType.SINGLE_TURN: { - "user_input:optional", - "response:optional", - "retrieved_contexts:optional", - "reference:optional", - "reference_contexts:optional", - }, - MetricType.MULTI_TURN: { - "user_input:optional", - "reference:optional", - }, - } - ) - single_turn_prompt: PydanticPrompt = field( - default_factory=lambda: SingleTurnSimpleCriteriaPrompt() - ) - multi_turn_prompt: PydanticPrompt = field( - default_factory=lambda: MultiTurnSimpleCriteriaPrompt() - ) - definition: str = field(default="", repr=True) - strictness: int = field(default=1, repr=False) - max_retries: int = 1 + def __init__( + self, + name: str, + definition: str, + llm: t.Optional[BaseRagasLLM] = None, + required_columns: t.Optional[t.Dict[MetricType, t.Set[str]]] = None, + single_turn_prompt: t.Optional[PydanticPrompt] = None, + multi_turn_prompt: t.Optional[PydanticPrompt] = None, + strictness: int = 1, + ): + if required_columns is None: + required_columns = { + MetricType.SINGLE_TURN: { + "user_input:optional", + "response:optional", + "retrieved_contexts:optional", + "reference:optional", + "reference_contexts:optional", + }, + MetricType.MULTI_TURN: { + "user_input:optional", + "reference:optional", + }, + } + super().__init__( + name=name, + llm=llm, + _required_columns=required_columns, + ) - def __post_init__(self): - if self.name == "": - raise ValueError("Expects a name") - if self.definition == "": - raise ValueError("Expects definition") + self.definition = definition + self.single_turn_prompt = single_turn_prompt or SingleTurnSimpleCriteriaPrompt() + self.multi_turn_prompt = multi_turn_prompt or MultiTurnSimpleCriteriaPrompt() + self.strictness = strictness # ensure odd number of checks to avoid tie in majority vote. self.strictness = ( self.strictness if self.strictness % 2 != 0 else self.strictness + 1 From 80612d1a0e75845d02bbcd4b11dec90573fcb9ad Mon Sep 17 00:00:00 2001 From: jjmachan Date: Tue, 19 Nov 2024 11:41:56 +0530 Subject: [PATCH 10/13] feat: added name to validation --- src/ragas/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragas/validation.py b/src/ragas/validation.py index a247eed18..d3082d876 100644 --- a/src/ragas/validation.py +++ b/src/ragas/validation.py @@ -79,5 +79,5 @@ def validate_supported_metrics(ds: EvaluationDataset, metrics: t.Sequence[Metric if not flag: raise ValueError( - f"The metric does not support the sample type {data_type}." + f"The metric '{m.name}' does not support the sample type {data_type}." 
) From c20f803cdb666d737c053bb968668f4f78a2f439 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Tue, 19 Nov 2024 15:53:55 +0530 Subject: [PATCH 11/13] feat: modified the metrics --- src/ragas/dataset_schema.py | 2 +- src/ragas/metrics/__init__.py | 2 + src/ragas/metrics/_aspect_critic.py | 28 ++++++------ src/ragas/metrics/_domain_specific_rubrics.py | 39 +++++++++-------- .../metrics/_instance_specific_rubrics.py | 43 +++++++++++-------- src/ragas/metrics/_simple_criteria.py | 3 ++ 6 files changed, 67 insertions(+), 50 deletions(-) diff --git a/src/ragas/dataset_schema.py b/src/ragas/dataset_schema.py index d3b4978d7..b3e07edc4 100644 --- a/src/ragas/dataset_schema.py +++ b/src/ragas/dataset_schema.py @@ -69,7 +69,7 @@ class SingleTurnSample(BaseSample): response: t.Optional[str] = None multi_responses: t.Optional[t.List[str]] = None reference: t.Optional[str] = None - rubric: t.Optional[t.Dict[str, str]] = None + rubrics: t.Optional[t.Dict[str, str]] = None class MultiTurnSample(BaseSample): diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index 8355772ca..2f164e980 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -48,6 +48,7 @@ ) from ragas.metrics._noise_sensitivity import NoiseSensitivity from ragas.metrics._rouge_score import RougeScore +from ragas.metrics._simple_criteria import SimpleCriteriaScore from ragas.metrics._sql_semantic_equivalence import LLMSQLEquivalence from ragas.metrics._string import ( DistanceMeasure, @@ -86,6 +87,7 @@ "ContextPrecision", "context_precision", "ContextUtilization", + "SimpleCriteriaScore", "ContextRecall", "context_recall", "AspectCritic", diff --git a/src/ragas/metrics/_aspect_critic.py b/src/ragas/metrics/_aspect_critic.py index 61d9da9f9..582d8bb74 100644 --- a/src/ragas/metrics/_aspect_critic.py +++ b/src/ragas/metrics/_aspect_critic.py @@ -134,23 +134,21 @@ def __init__( strictness: int = 1, max_retries: int = 1, ): - if required_columns is None: - required_columns = { - MetricType.SINGLE_TURN: { - "user_input:optional", - "response:optional", - "retrieved_contexts:optional", - "reference:optional", - "reference_contexts:optional", - }, - MetricType.MULTI_TURN: { - "user_input", - }, - } - + self._required_columns = required_columns or { + MetricType.SINGLE_TURN: { + "user_input:optional", + "response:optional", + "retrieved_contexts:optional", + "reference:optional", + "reference_contexts:optional", + }, + MetricType.MULTI_TURN: { + "user_input", + }, + } super().__init__( name=name, - _required_columns=required_columns, + _required_columns=self._required_columns, llm=llm, ) diff --git a/src/ragas/metrics/_domain_specific_rubrics.py b/src/ragas/metrics/_domain_specific_rubrics.py index 129833910..11b9bb269 100644 --- a/src/ragas/metrics/_domain_specific_rubrics.py +++ b/src/ragas/metrics/_domain_specific_rubrics.py @@ -18,6 +18,8 @@ if t.TYPE_CHECKING: from langchain_core.callbacks import Callbacks + from ragas.llms import BaseRagasLLM + logger = logging.getLogger(__name__) @@ -119,11 +121,22 @@ class MultiTurnPrompt(PydanticPrompt[MultiTurnInput, ScoreFeedback]): ] -@dataclass class RubricsScore(MetricWithLLM, SingleTurnMetric, MultiTurnMetric): - name: str = "rubrics_score" - _required_columns: t.Dict[MetricType, t.Set[str]] = field( - default_factory=lambda: { + def __init__( + self, + name: str = "domain_specific_rubrics", + rubrics: t.Dict[str, str] = DEFAULT_REFERENCE_FREE_RUBRICS, + llm: t.Optional[BaseRagasLLM] = None, + required_columns: t.Optional[t.Dict[MetricType, 
t.Set[str]]] = None, + single_turn_prompt: t.Optional[PydanticPrompt] = None, + multi_turn_prompt: t.Optional[PydanticPrompt] = None, + max_retries: int = 1, + ): + self.rubrics = rubrics + self.single_turn_scoring_prompt = single_turn_prompt or SingleTurnPrompt() + self.multi_turn_scoring_prompt = multi_turn_prompt or MultiTurnPrompt() + self.max_retries = max_retries + self._required_columns = required_columns or { MetricType.SINGLE_TURN: { "user_input:optional", "response:optional", @@ -135,19 +148,11 @@ class RubricsScore(MetricWithLLM, SingleTurnMetric, MultiTurnMetric): "user_input:optional", "reference:optional", }, - }, - repr=False, - ) - rubrics: t.Dict[str, str] = field( - default_factory=lambda: DEFAULT_REFERENCE_FREE_RUBRICS - ) - max_retries: int = 1 - single_turn_scoring_prompt: PydanticPrompt[SingleTurnInput, ScoreFeedback] = field( - default_factory=SingleTurnPrompt, repr=False - ) - multi_turn_scoring_prompt: PydanticPrompt[MultiTurnInput, ScoreFeedback] = field( - default_factory=MultiTurnPrompt, repr=False - ) + } + super().__init__(name=name, llm=llm, _required_columns=self._required_columns) + + def __repr__(self) -> str: + return f"{self.name}(required_columns={self.required_columns}, llm={self.llm}), rubrics={self.rubrics}" async def _single_turn_ascore( self, sample: SingleTurnSample, callbacks: Callbacks diff --git a/src/ragas/metrics/_instance_specific_rubrics.py b/src/ragas/metrics/_instance_specific_rubrics.py index f26609e62..d9d126017 100644 --- a/src/ragas/metrics/_instance_specific_rubrics.py +++ b/src/ragas/metrics/_instance_specific_rubrics.py @@ -1,7 +1,6 @@ from __future__ import annotations import typing as t -from dataclasses import dataclass, field from ragas.dataset_schema import MultiTurnSample, SingleTurnSample from ragas.metrics._domain_specific_rubrics import ( @@ -21,12 +20,20 @@ if t.TYPE_CHECKING: from langchain_core.callbacks import Callbacks + from ragas.llms import BaseRagasLLM + -@dataclass class InstanceRubrics(MetricWithLLM, SingleTurnMetric, MultiTurnMetric): - name: str = "labelled_rubrics_score" - _required_columns: t.Dict[MetricType, t.Set[str]] = field( - default_factory=lambda: { + def __init__( + self, + name: str = "instance_rubrics", + llm: t.Optional[BaseRagasLLM] = None, + required_columns: t.Optional[t.Dict[MetricType, t.Set[str]]] = None, + single_turn_prompt: t.Optional[PydanticPrompt] = None, + multi_turn_prompt: t.Optional[PydanticPrompt] = None, + max_retries: int = 1, + ): + self._required_columns = required_columns or { MetricType.SINGLE_TURN: { "rubrics", "user_input:optional", @@ -40,30 +47,32 @@ class InstanceRubrics(MetricWithLLM, SingleTurnMetric, MultiTurnMetric): "user_input:optional", "reference:optional", }, - }, - repr=False, - ) - single_turn_prompt: PydanticPrompt = field( - default_factory=lambda: SingleTurnPrompt() - ) - multi_turn_prompt: PydanticPrompt = field(default_factory=lambda: MultiTurnPrompt()) + } + super().__init__(name=name, llm=llm, _required_columns=self._required_columns) + + self.single_turn_prompt = single_turn_prompt or SingleTurnPrompt() + self.multi_turn_prompt = multi_turn_prompt or MultiTurnPrompt() + self.max_retries = max_retries - max_retries: int = 1 + def __repr__(self) -> str: + return f"{self.name}(required_columns={self.required_columns}, llm={self.llm})" async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: assert self.llm is not None, "LLM is not set" user_input, contexts, response, reference, rubrics = ( - row["user_input"], + row.get("user_input"), 
row.get("retrieved_contexts"), - row["response"], - row["reference"], - row["rubrics"], + row.get("response"), + row.get("reference"), + row.get("rubrics"), ) if contexts is not None: contexts = "\n".join(contexts) user_input = f"{user_input} answer using context: {contexts}" + if rubrics is None: + raise ValueError(f"Rubrics are not set for the sample: {row}") prompt_input = SingleTurnInput( user_input=user_input, response=response, diff --git a/src/ragas/metrics/_simple_criteria.py b/src/ragas/metrics/_simple_criteria.py index f615d6a1a..e6bee0842 100644 --- a/src/ragas/metrics/_simple_criteria.py +++ b/src/ragas/metrics/_simple_criteria.py @@ -178,6 +178,9 @@ def __init__( self.strictness if self.strictness % 2 != 0 else self.strictness + 1 ) + def __repr__(self) -> str: + return f"{self.name}(required_columns={self.required_columns}, llm={self.llm}, definition={self.definition})" + def _compute_score( self, safe_loaded_responses: t.List[SimpleCriteriaOutput] ) -> float: From 82b4b79c026be19b7b97dbac146019a251d13954 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Tue, 19 Nov 2024 15:55:12 +0530 Subject: [PATCH 12/13] docs: update the docs --- .../available_metrics/general_purpose.md | 121 ++---------------- docs/howtos/customizations/metrics/_cost.md | 11 +- .../metrics/_write_your_own_metric.md | 4 +- .../metrics/write_your_own_metric.ipynb | 4 +- .../testgenerator/_persona_generator.md | 16 ++- .../_langgraph_agent_evaluation.md | 2 +- 6 files changed, 35 insertions(+), 123 deletions(-) diff --git a/docs/concepts/metrics/available_metrics/general_purpose.md b/docs/concepts/metrics/available_metrics/general_purpose.md index 78c25d073..06bcfd6b3 100644 --- a/docs/concepts/metrics/available_metrics/general_purpose.md +++ b/docs/concepts/metrics/available_metrics/general_purpose.md @@ -6,7 +6,6 @@ General purpose evaluation metrics are used to evaluate any given task. `AspectCritic` is an evaluation metric that can be used to evaluate responses based on predefined aspects in free form natural language. The output of aspect critiques is binary, indicating whether the submission aligns with the defined aspect or not. -**Without reference** ### Example @@ -28,32 +27,6 @@ scorer = AspectCritic( await scorer.single_turn_ascore(sample) ``` -**With reference** - -### Example - -```python -from ragas.dataset_schema import SingleTurnSample -from ragas.metrics import AspectCriticWithReference - - -sample = SingleTurnSample( - user_input="Where is the Eiffel Tower located?", - response="The Eiffel Tower is located in Paris.", - reference="The Eiffel Tower is located in Paris.", -) - -scorer = AspectCritic( - name="correctness", - definition="Is the response factually similar to the reference?", - llm=evaluator_llm - - ) - -await scorer.single_turn_ascore(sample) - -``` - ### How it works Critics are essentially basic LLM calls using the defined criteria. For example, let's see how the harmfulness critic works: @@ -74,41 +47,22 @@ Critics are essentially basic LLM calls using the defined criteria. For example, Course graned evaluation method is an evaluation metric that can be used to score (integer) responses based on predefined single free form scoring criteria. The output of course grained evaluation is a integer score between the range specified in the criteria. 
-**Without Reference** - -```python -from ragas.dataset_schema import SingleTurnSample -from ragas.metrics import SimpleCriteriaScoreWithoutReference - - -sample = SingleTurnSample( - user_input="Where is the Eiffel Tower located?", - response="The Eiffel Tower is located in Paris.", -) - -scorer = SimpleCriteriaScoreWithoutReference(name="course_grained_score", - definition="Score 0 to 5 for correctness", - llm=evaluator_llm -) -await scorer.single_turn_ascore(sample) -``` - -**With Reference** - ```python from ragas.dataset_schema import SingleTurnSample -from ragas.metrics import SimpleCriteriaScoreWithReference +from ragas.metrics import SimpleCriteriaScore sample = SingleTurnSample( - user_input="Where is the Eiffel Tower located?", + user_input="Where is the Eiffel Tower located?", response="The Eiffel Tower is located in Paris.", reference="The Eiffel Tower is located in Egypt" ) -scorer = SimpleCriteriaScoreWithReference(name="course_grained_score", - definition="Score 0 to 5 by similarity", - llm=evaluator_llm) +scorer = SimpleCriteriaScore( + name="course_grained_score", + definition="Score 0 to 5 by similarity", + llm=evaluator_llm +) await scorer.single_turn_ascore(sample) ``` @@ -117,14 +71,10 @@ await scorer.single_turn_ascore(sample) Domain specific evaluation metric is a rubric-based evaluation metric that is used to evaluate responses on a specific domain. The rubric consists of descriptions for each score, typically ranging from 1 to 5. The response here is evaluation and scored using the LLM using description specified in the rubric. This metric also have reference free and reference based variations. -### With Reference - -Used when you have reference answer to evaluate the responses against. - #### Example ```python from ragas.dataset_schema import SingleTurnSample -from ragas.metrics import RubricsScoreWithReference +from ragas.metrics import RubricsScore sample = SingleTurnSample( user_input="Where is the Eiffel Tower located?", response="The Eiffel Tower is located in Paris.", reference="The Eiffel Tower is located in Paris.", ) rubrics = { "score1_description": "The response is incorrect, irrelevant, or does not align with the ground truth.", "score2_description": "The response partially matches the ground truth but includes significant errors, omissions, or irrelevant information.", "score3_description": "The response generally aligns with the ground truth but may lack detail, clarity, or have minor inaccuracies.", "score4_description": "The response is mostly accurate and aligns well with the ground truth, with only minor issues or missing details.", "score5_description": "The response is fully accurate, aligns completely with the ground truth, and is clear and detailed.", } -scorer = RubricsScoreWithReference(rubrics=rubrics, llm=evaluator_llm) +scorer = RubricsScore(rubrics=rubrics, llm=evaluator_llm) await scorer.single_turn_ascore(sample) ``` -### Without Reference - -Used when you don't have reference answer to evaluate the responses against. - -#### Example -```python -from ragas.dataset_schema import SingleTurnSample -from ragas.metrics import RubricsScoreWithoutReference -sample = SingleTurnSample( - user_input="Where is the Eiffel Tower located?", - response="The Eiffel Tower is located in Paris.", -) - -scorer = RubricsScoreWithoutReference(rubrics=rubrics, llm=evaluator_llm) -await scorer.single_turn_ascore(sample) -``` - - ## Instance Specific rubrics criteria scoring Instance specific evaluation metric is a rubric-based evaluation metric that is used to evaluate responses on a specific instance, ie each instance to be evaluated is annotated with a rubric based evaluation criteria. The rubric consists of descriptions for each score, typically ranging from 1 to 5. The response here is evaluation and scored using the LLM using description specified in the rubric. This metric also have reference free and reference based variations.
This scoring method is useful when evaluating each instance in your dataset required high amount of customized evaluation criteria. -### With Reference - -Used when you have reference answer to evaluate the responses against. - -#### Example -```python -from ragas.dataset_schema import SingleTurnSample -from ragas.metrics import InstanceRubricsWithReference - - -SingleTurnSample( - user_input="Where is the Eiffel Tower located?", - response="The Eiffel Tower is located in Paris.", - reference="The Eiffel Tower is located in Paris.", - rubrics = { - "score1": "The response is completely incorrect or irrelevant (e.g., 'The Eiffel Tower is in London.' or no mention of the Eiffel Tower).", - "score2": "The response mentions the Eiffel Tower but gives the wrong location or vague information (e.g., 'The Eiffel Tower is in Europe.' or 'It is in France.' without specifying Paris).", - "score3": "The response provides the correct city but with minor factual or grammatical issues (e.g., 'The Eiffel Tower is in Paris, Germany.' or 'The tower is located at Paris.').", - "score4": "The response is correct but lacks some clarity or extra detail (e.g., 'The Eiffel Tower is in Paris, France.' without other useful context or slightly awkward phrasing).", - "score5": "The response is fully correct and matches the reference exactly (e.g., 'The Eiffel Tower is located in Paris.' with no errors or unnecessary details)." - } -) - -scorer = InstanceRubricsWithReference(llm=evaluator_llm) -await scorer.single_turn_ascore(sample) -``` - -### Without Reference - -Used when you don't have reference answer to evaluate the responses against. - #### Example ```python from ragas.dataset_schema import SingleTurnSample -from ragas.metrics import InstanceRubricsScoreWithoutReference +from ragas.metrics import InstanceRubricsScore SingleTurnSample( @@ -212,6 +113,6 @@ SingleTurnSample( } ) -scorer = InstanceRubricsScoreWithoutReference(llm=evaluator_llm) +scorer = InstanceRubricsScore(llm=evaluator_llm) await scorer.single_turn_ascore(sample) ``` diff --git a/docs/howtos/customizations/metrics/_cost.md b/docs/howtos/customizations/metrics/_cost.md index d160bc61c..3cd5501a5 100644 --- a/docs/howtos/customizations/metrics/_cost.md +++ b/docs/howtos/customizations/metrics/_cost.md @@ -13,6 +13,7 @@ For an example here is one that will parse OpenAI by using a parser we have defi ```python import os + os.environ["OPENAI_API_KEY"] = "your-api-key" ``` @@ -61,8 +62,6 @@ metric = AspectCriticWithReference( name="answer_correctness", definition="is the response correct compared to reference", ) - - ``` Repo card metadata block was not found. Setting CardData to empty. 
@@ -73,8 +72,12 @@ metric = AspectCriticWithReference( from ragas import evaluate from ragas.cost import get_token_usage_for_openai -results = evaluate(eval_dataset[:5], metrics=[metric], llm=gpt4o, - token_usage_parser=get_token_usage_for_openai,) +results = evaluate( + eval_dataset[:5], + metrics=[metric], + llm=gpt4o, + token_usage_parser=get_token_usage_for_openai, +) ``` Evaluating: 100%|โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 5/5 [00:01<00:00, 2.81it/s] diff --git a/docs/howtos/customizations/metrics/_write_your_own_metric.md b/docs/howtos/customizations/metrics/_write_your_own_metric.md index 4913309b1..0df90e446 100644 --- a/docs/howtos/customizations/metrics/_write_your_own_metric.md +++ b/docs/howtos/customizations/metrics/_write_your_own_metric.md @@ -90,9 +90,9 @@ Now lets init the metric with the rubric and evaluator llm and evaluate the data ```python -from ragas.metrics import RubricsScoreWithoutReference +from ragas.metrics import RubricsScore -hallucinations_rubric = RubricsScoreWithoutReference( +hallucinations_rubric = RubricsScore( name="hallucinations_rubric", llm=evaluator_llm, rubrics=rubric ) diff --git a/docs/howtos/customizations/metrics/write_your_own_metric.ipynb b/docs/howtos/customizations/metrics/write_your_own_metric.ipynb index 66407cfe6..131994797 100644 --- a/docs/howtos/customizations/metrics/write_your_own_metric.ipynb +++ b/docs/howtos/customizations/metrics/write_your_own_metric.ipynb @@ -160,9 +160,9 @@ } ], "source": [ - "from ragas.metrics import RubricsScoreWithoutReference\n", + "from ragas.metrics import RubricsScore\n", "\n", - "hallucinations_rubric = RubricsScoreWithoutReference(\n", + "hallucinations_rubric = RubricsScore(\n", " name=\"hallucinations_rubric\", llm=evaluator_llm, rubrics=rubric\n", ")\n", "\n", diff --git a/docs/howtos/customizations/testgenerator/_persona_generator.md b/docs/howtos/customizations/testgenerator/_persona_generator.md index d4c6d0db0..d0d32824c 100644 --- a/docs/howtos/customizations/testgenerator/_persona_generator.md +++ b/docs/howtos/customizations/testgenerator/_persona_generator.md @@ -14,9 +14,18 @@ Which we can define as follows: ```python from ragas.testset.persona import Persona -persona_new_joinee = Persona(name="New Joinee", role_description="Don't know much about the company and is looking for information on how to get started.") -persona_manager = Persona(name="Manager", role_description="Wants to know about the different teams and how they collaborate with each other.") -persona_senior_manager = Persona(name="Senior Manager", role_description="Wants to know about the company vision and how it is executed.") +persona_new_joinee = Persona( + name="New Joinee", + role_description="Don't know much about the company and is looking for information on how to get started.", +) +persona_manager = Persona( + name="Manager", + role_description="Wants to know about the different teams and how they collaborate with each other.", +) +persona_senior_manager = Persona( + name="Senior Manager", + role_description="Wants to know about the company vision and how it is executed.", +) personas = [persona_new_joinee, persona_manager, persona_senior_manager] personas @@ -49,7 +58,6 @@ testset_generator = TestsetGenerator(knowledge_graph=kg, persona_list=personas, # Generate the Testset testset = testset_generator.generate(testset_size=10) testset - ``` diff --git a/docs/howtos/integrations/_langgraph_agent_evaluation.md b/docs/howtos/integrations/_langgraph_agent_evaluation.md index 800f0678c..a694db948 100644 --- 
a/docs/howtos/integrations/_langgraph_agent_evaluation.md +++ b/docs/howtos/integrations/_langgraph_agent_evaluation.md @@ -289,7 +289,7 @@ ragas_trace = convert_to_ragas_messages(result["messages"]) ```python -ragas_trace # List of Ragas messages +ragas_trace # List of Ragas messages ``` From b731e4c53bbafd6a86c50103e1459f048ff29df8 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Tue, 19 Nov 2024 16:35:44 +0530 Subject: [PATCH 13/13] fix: ci issues --- src/ragas/metrics/_answer_similarity.py | 6 +----- src/ragas/metrics/_aspect_critic.py | 1 - src/ragas/metrics/_domain_specific_rubrics.py | 1 - src/ragas/testset/transforms/extractors/llm_based.py | 7 ++----- 4 files changed, 3 insertions(+), 12 deletions(-) diff --git a/src/ragas/metrics/_answer_similarity.py b/src/ragas/metrics/_answer_similarity.py index 83f0caabc..6a6f1ba0e 100644 --- a/src/ragas/metrics/_answer_similarity.py +++ b/src/ragas/metrics/_answer_similarity.py @@ -8,11 +8,7 @@ from ragas.dataset_schema import SingleTurnSample from ragas.embeddings.base import HuggingfaceEmbeddings -from ragas.metrics.base import ( - MetricType, - MetricWithEmbeddings, - SingleTurnMetric, -) +from ragas.metrics.base import MetricType, MetricWithEmbeddings, SingleTurnMetric if t.TYPE_CHECKING: from langchain_core.callbacks.base import Callbacks diff --git a/src/ragas/metrics/_aspect_critic.py b/src/ragas/metrics/_aspect_critic.py index 582d8bb74..93b95855b 100644 --- a/src/ragas/metrics/_aspect_critic.py +++ b/src/ragas/metrics/_aspect_critic.py @@ -3,7 +3,6 @@ import logging import typing as t from collections import Counter -from dataclasses import dataclass, field from pydantic import BaseModel, Field diff --git a/src/ragas/metrics/_domain_specific_rubrics.py b/src/ragas/metrics/_domain_specific_rubrics.py index 11b9bb269..8cf1dfb08 100644 --- a/src/ragas/metrics/_domain_specific_rubrics.py +++ b/src/ragas/metrics/_domain_specific_rubrics.py @@ -2,7 +2,6 @@ import logging import typing as t -from dataclasses import dataclass, field from pydantic import BaseModel, Field diff --git a/src/ragas/testset/transforms/extractors/llm_based.py b/src/ragas/testset/transforms/extractors/llm_based.py index 21ec066a2..e5fea0c9e 100644 --- a/src/ragas/testset/transforms/extractors/llm_based.py +++ b/src/ragas/testset/transforms/extractors/llm_based.py @@ -114,7 +114,7 @@ class HeadlinesExtractorPrompt(PydanticPrompt[TextWithExtractionLimit, Headlines "Introduction", "Main Concepts", "Detailed Analysis", - "Subsection: Specialized Techniques" + "Subsection: Specialized Techniques", "Future Directions", "Conclusion", ], @@ -212,7 +212,6 @@ async def extract(self, node: Node) -> t.Tuple[str, t.Any]: return self.property_name, keyphrases - @dataclass class TitleExtractor(LLMBasedExtractor): """ @@ -307,9 +306,7 @@ class TopicDescription(BaseModel): class TopicDescriptionPrompt(PydanticPrompt[StringIO, TopicDescription]): - instruction: str = ( - "Provide a concise description of the main topic(s) discussed in the following text." - ) + instruction: str = "Provide a concise description of the main topic(s) discussed in the following text." input_model: t.Type[StringIO] = StringIO output_model: t.Type[TopicDescription] = TopicDescription examples: t.List[t.Tuple[StringIO, TopicDescription]] = [