diff --git a/backend/promptfoo/intent_config.yaml b/backend/promptfoo/intent_config.yaml index 353f5703..b6976185 100644 --- a/backend/promptfoo/intent_config.yaml +++ b/backend/promptfoo/intent_config.yaml @@ -1,4 +1,4 @@ -description: "Intent" +description: 'Intent' providers: - id: openai:gpt-4o-mini @@ -8,50 +8,50 @@ providers: prompts: file://promptfoo_test_runner.py:create_prompt tests: - - description: "questions directed towards the database lookups should have only 1 question -1" + - description: 'questions directed towards the database lookups should produce an empty questions array -1' vars: - system_prompt_template: "intent-system" - user_prompt_template: "intent" + system_prompt_template: 'intent-system' + user_prompt_template: 'intent' user_prompt_args: chat_history: [] - question: "Check the database and tell me the average ESG score (Environmental) for the WhiteRock ETF fund" + question: 'Check the database and tell me the average ESG score (Environmental) for the WhiteRock ETF fund' assert: - type: javascript value: JSON.parse(output).questions.length === 0 - - description: "questions directed towards the database look ups should have only 1 question -2" + - description: 'questions directed towards the database lookups should produce an empty questions array -2' vars: - system_prompt_template: "intent-system" - user_prompt_template: "intent" + system_prompt_template: 'intent-system' + user_prompt_template: 'intent' user_prompt_args: chat_history: [] - question: "Using Bloomberg.csv dataset give me the company with the best esg score" + question: 'Using Bloomberg.csv dataset give me the company with the best esg score' assert: - type: javascript value: JSON.parse(output).questions.length === 0 - - description: "verify that the correct company name is determined from the chat history" + - description: 'verify that the correct company name is determined from the chat history' vars: - system_prompt_template: "intent-system" - user_prompt_template: "intent" + 
system_prompt_template: 'intent-system' + user_prompt_template: 'intent' user_prompt_args: chat_history: | [ "User: When was Coca Cola founded?", "System: Coca-Cola was founded on May 8, 1886.", ] - question: "What is their best selling product?" + question: 'What is their best selling product?' assert: - type: javascript value: output.includes("Coca-Cola") || output.includes("Coca Cola") - - description: "verify that the question is correctly split up" + - description: 'verify that the question is correctly split up' vars: - system_prompt_template: "intent-system" - user_prompt_template: "intent" + system_prompt_template: 'intent-system' + user_prompt_template: 'intent' user_prompt_args: chat_history: [] - question: "Compare Ryanair emissions to other companies in the industry" + question: 'Compare Ryanair emissions to other companies in the industry' assert: - type: javascript value: JSON.parse(output).questions[0].includes("Ryanair") @@ -59,3 +59,44 @@ tests: value: The 1st item in the questions array contains a question about finding the emissions for Ryanair - type: llm-rubric value: The 2nd item in the questions array contains a question about finding the emissions for companies in the industry + + - description: 'verify intent for finding ESG scores online in the Technology sector' + vars: + system_prompt_template: 'intent-system' + user_prompt_template: 'intent' + user_prompt_args: + chat_history: [] + question: 'provide a list of companies with the highest ESG scores in the Technology sector?' + assert: + - type: javascript + value: JSON.parse(output).user_intent.includes("Technology sector") + - type: javascript + value: JSON.parse(output).questions[0].includes("highest ESG scores") + - type: llm-rubric + value: The output correctly identifies the intent to search online for companies in the Technology sector with high ESG scores. 
+ + - description: 'Validation - General information is rejected' + vars: + system_prompt_template: 'validator' + user_prompt_template: 'validate' + user_prompt_args: + task: 'Provide a list of companies with the highest ESG scores in the Technology sector.' + answer: "As of the end of 2023, the Technology sector had the highest weighted-average ESG score among all sectors, according to the MSCI ACWI SRI Index. However, I don't have a specific list of individual companies with the highest scores." + assert: + - type: javascript + value: JSON.parse(output).response === "false" + - type: llm-rubric + value: The reasoning should explain that general sector information is insufficient to fulfill the task. + + - description: 'Validation - Incorrect company is rejected' + vars: + system_prompt_template: 'validator' + user_prompt_template: 'validate' + user_prompt_args: + task: "What are Apple's ESG scores?" + answer: "Microsoft's ESG (Environmental, Social, and Governance) scores are as follows: Environmental Score of 95.0, Social Score of 90.0, Governance Score of 92.0." + assert: + - type: javascript + value: JSON.parse(output).response === "false" + - type: llm-rubric + value: The reasoning should explain that the scores provided do not match Apple's scores as requested. diff --git a/backend/src/agents/web_agent.py b/backend/src/agents/web_agent.py index 8a520792..efd51d15 100644 --- a/backend/src/agents/web_agent.py +++ b/backend/src/agents/web_agent.py @@ -86,7 +86,7 @@ async def web_general_search_core(search_query, llm, model) -> str: continue # Skip if the summarization is not valid response = { "content": summary, - "ignore_validation": "false" + "ignore_validation": "true" # This is to ignore the validation of the answer again by the supervisor } return json.dumps(response, indent=4) return "No relevant information found on the internet for the given query." 
diff --git a/backend/src/prompts/templates/intent-system.j2 b/backend/src/prompts/templates/intent-system.j2 index b515f6a4..08ebbda5 100644 --- a/backend/src/prompts/templates/intent-system.j2 +++ b/backend/src/prompts/templates/intent-system.j2 @@ -13,4 +13,7 @@ Output your result in the following json format: "question": "string of the original question", "user_intent": "string of the intent of the user's question", "questions": array of singular objective questions or if the question mentions csv, dataset or database an empty array -} \ No newline at end of file +} + +Guidelines: +- If the user has asked to check online, then each question in the questions array should also specify that. \ No newline at end of file diff --git a/backend/src/prompts/templates/validator.j2 b/backend/src/prompts/templates/validator.j2 index 8992cc09..ac890dd0 100644 --- a/backend/src/prompts/templates/validator.j2 +++ b/backend/src/prompts/templates/validator.j2 @@ -2,7 +2,7 @@ You are an expert validator. You can help with validating the answers to the tas Your entire purpose is to return a "true" or "false" value to indicate if the answer has fulfilled the task, along with a reasoning to explain your decision. -You will be passed a task and an answer. You need to determine if the answer is correct or not. +You will be passed a task and an answer. You need to determine if the answer is correct or not, ensuring that the task's specific requirements are addressed. Output format: @@ -14,10 +14,13 @@ json } **Validation Guidelines:** -- Be lenient - if the answer looks reasonably accurate, return "true". -- If multiple entities have the same highest score and this matches the query intent, return "true". +- The answer must fulfill the specific intent of the task, not just provide related information. +- Be lenient if the answer is reasonably accurate and fulfills the task's intent, even if it lacks minor details. 
+- If specific data (like a list of companies) is requested but missing, return "false". +- If multiple entities have the same highest score and this matches the query intent, return "true". - Spending is negative; ensure any calculations involving spending reflect this if relevant to the task. + Example: Task: What is 2 + 2? Answer: 4 @@ -33,6 +36,20 @@ Answer: 5 "reasoning": "The answer is incorrect; 2 + 2 equals 4, not 5." } +Task: Provide a list of companies with the highest ESG scores in the Technology sector. +Answer: As of the end of 2023, the Technology sector had the highest weighted-average ESG score among all sectors, according to the MSCI ACWI SRI Index. However, I don't have a specific list of individual companies with the highest scores. +{ + "response": "false", + "reasoning": "The answer provides general information about ESG scores in the Technology sector but fails to fulfill the task's intent of listing companies with the highest scores." +} + +Task: Provide a list of companies with the highest ESG scores in the Technology sector. +Answer: Here are the companies with the highest ESG scores in the Technology sector: 1. Apple Inc., 2. Microsoft Corp., 3. Alphabet Inc. +{ + "response": "true", + "reasoning": "The answer lists companies with the highest ESG scores in the Technology sector, fulfilling the task's intent." +} + Task: What are Apple's ESG scores? Answer: Apple's ESG (Environmental, Social, and Governance) scores are as follows: Environmental Score of 95.0, Social Score of 90.0, Governance Score of 92.0. 
{ diff --git a/backend/src/utils/web_utils.py b/backend/src/utils/web_utils.py index e83d43ee..f53adc1d 100644 --- a/backend/src/utils/web_utils.py +++ b/backend/src/utils/web_utils.py @@ -13,7 +13,7 @@ engine = PromptEngine() -async def search_urls(search_query, num_results=10) -> str: +async def search_urls(search_query, num_results=30) -> str: logger.info(f"Searching the web for: {search_query}") try: https_urls = [str(url) for url in search(search_query, num_results=num_results) if str(url).startswith("https")]