diff --git a/.github/workflows/test-backend.yml b/.github/workflows/test-backend.yml index 84d82744..cd1f3ebc 100644 --- a/.github/workflows/test-backend.yml +++ b/.github/workflows/test-backend.yml @@ -1,5 +1,6 @@ name: Test Backend -on: +on: + workflow_dispatch: pull_request: push: branches: @@ -33,4 +34,5 @@ jobs: emoji: true verbose: true job-summary: true + custom-arguments: '--ignore=tests/BDD' report-title: 'Backend Test Report' diff --git a/backend/src/utils/config.py b/backend/src/utils/config.py index cbf1cdde..5f7e4b8b 100644 --- a/backend/src/utils/config.py +++ b/backend/src/utils/config.py @@ -4,7 +4,7 @@ default_frontend_url = "http://localhost:8650" default_neo4j_uri = "bolt://localhost:7687" default_files_directory = "files" -default_redis_host = "redis" +default_redis_host = "localhost" default_redis_cache_duration = 3600 diff --git a/backend/tests/BDD/features/Correctness/Accuracy_Factual_Correctness.feature b/backend/tests/BDD/features/Correctness/Accuracy_Factual_Correctness.feature index 9d0eb168..3bbb27e0 100644 --- a/backend/tests/BDD/features/Correctness/Accuracy_Factual_Correctness.feature +++ b/backend/tests/BDD/features/Correctness/Accuracy_Factual_Correctness.feature @@ -5,29 +5,14 @@ Scenario Outline: When a user asks InferESG for information about their transact Then the response to this '<prompt>' should match the '<expected_response>' Examples: |prompt |expected_response | -# |How much did I spend at Tesco? 
|639.84| -|Check the database and tell me the fund with the highest ESG social score |The average ESG score (Environmental) for the WhiteRock ETF fund is approximately 69.67.| -|Check the database and tell me the fund with the highest ESG social score |Highest ESG score is Dynamic Industries with a score of 91| +|Check the database and tell me the average ESG score (Environmental) for the WhiteRock ETF fund |The average ESG score (Environmental) for the WhiteRock ETF fund is approximately 69.67| +|Check the database and tell me the fund with the highest ESG social score |Dynamic Industries with a score of 91| +|Check the database and tell me the fund with the lowest Governance ESG score |Dynamic Industries, which has a score of 60| +# |Check the database and tell me the fund with the lowest ESG score |Dynamic Industries with a score of 50| +# |Check the database and tell me the largest fund |The largest fund is the Global Energy Fund, which has a size of 1,500| +# |Check the database and tell me which funds contain Shell |Funds containing Shell are European Growth Fund, Global Energy Fund, Silverman Global ETF and WhiteRock ETF| -# |How much did I spend at Tesco? |639.84| -# |How much did I spend at Tesco? |639.84| -# |How much did I spend at Tesco? |639.84| -# |How much did I spend at Tesco? |639.84| -# |How much did I spend at Tesco? |639.84| -# |How much did I spend at Tesco? |639.84| -# |How much did I spend at Tesco? |639.84| -# |How much did I spend at Tesco? |639.84| -# |How much did I spend at Tesco? |639.84| -# |How much did I spend at Tesco? |639.84| -# |How much did I spend at Tesco? |639.84| -# |How much did I spend at Tesco? |639.84| -# |How much did I spend at Tesco? |639.84| -# |How much did I spend at Tesco? |639.84| - -# |How much did I spend on Amazon? |You spent a total of £1586.56 on Amazon | -# |How much did I spend on Tesco compared to Amazon? 
|946.72 | - @web_agent Scenario Outline: When a user asks InferESG generic questions Given a prompt to InferESG @@ -50,5 +35,4 @@ Scenario Outline: Check Response's confidence Examples: |prompt | |What is the capital of France? | -|How much did I spend at Tesco? | diff --git a/backend/tests/BDD/step_defs/test_prompts.py b/backend/tests/BDD/step_defs/test_prompts.py index 26e49c4b..732356d2 100644 --- a/backend/tests/BDD/step_defs/test_prompts.py +++ b/backend/tests/BDD/step_defs/test_prompts.py @@ -1,31 +1,38 @@ from pytest_bdd import given, when, then, parsers, scenarios import pytest +import logging from tests.BDD.test_utilities import ( - send_prompt, + send_prompt, app_healthcheck, correctness_evaluator, - healthy_response, + healthy_response, check_response_confidence, ) from decimal import Decimal import decimal +logger = logging.getLogger(__name__) + scenarios("../features/Correctness/Accuracy_Factual_Correctness.feature") + @pytest.fixture def context(): return {} + @given(parsers.parse("a prompt to InferESG")) def prepare_prompt(context): healthcheck_response = app_healthcheck() assert healthcheck_response.status_code == 200 assert healthcheck_response.json() == healthy_response - context['health_check_passed'] = True + context["health_check_passed"] = True + @when(parsers.parse("I get the response")) def get_response(context): - assert context.get('health_check_passed', False) + assert context.get("health_check_passed", False) + @then(parsers.parse("the response to this '{prompt}' should match the '{expected_response}'")) def check_response_includes_expected_response(context, prompt, expected_response): @@ -36,44 +43,43 @@ def check_response_includes_expected_response(context, prompt, expected_response expected_value = Decimal(str(expected_response).strip()) actual_value = Decimal(str(actual_response).strip()) - tolerance = Decimal('0.01') + tolerance = Decimal("0.01") is_equal = abs(expected_value - actual_value) <= tolerance if not is_equal: - 
pytest.fail( - f"\nNumeric values don't match!\n" - f"Expected: {expected_value}\n" - f"Actual: {actual_value}" - ) + pytest.fail(f"\nNumeric values don't match!\n" f"Expected: {expected_value}\n" f"Actual: {actual_value}") except (ValueError, decimal.InvalidOperation): expected_str = str(expected_response).strip() actual_str = str(actual_response).strip() - - assert actual_str.find(expected_str) != -1, ( - f"\nTest failed!\n" - f"Expected: {expected_str}\n" - f"Actual: {actual_str}" + logger.info(f"Expected : {expected_str} \nActual: {actual_str}") + + if actual_str.find(expected_str) == -1: + result = correctness_evaluator.evaluate_strings( + input=prompt, + prediction=actual_str, + reference=expected_str, ) - # if expected_str != actual_str: - # result = correctness_evaluator.evaluate_strings( - # input=prompt, - # prediction=expected_str, - # reference=actual_str, - # ) + if result["value"] == "N": + logger.error( + f"\nTest failed!\n" + f"Expected: {expected_str}\n" + f"Actual: {actual_str}\n" + f"Reasoning: {result.get('reasoning', 'No reasoning provided')}" + ) + + assert result["value"] == "Y", ( + f"\nTest failed!\n" + f"Expected: {expected_str}\n" + f"Actual: {actual_str}\n" + f"Reasoning: {result.get('reasoning', 'No reasoning provided')}" + ) - # assert result["score"] == 1, ( - # f"\nTest failed!\n" - # f"Expected: {expected_str}\n" - # f"Actual: {actual_str}\n" - # f"Reasoning: {result.get('reasoning', 'No reasoning provided')}" - # ) @then(parsers.parse("the response to this '{prompt}' should give a confident answer")) def check_bot_response_confidence(prompt): response = send_prompt(prompt) result = check_response_confidence(prompt, response.json()) - assert result["score"] == 1, "The bot response is not confident enough. 
\nReasoning: " + result["reasoning"] diff --git a/backend/tests/BDD/test_utilities.py b/backend/tests/BDD/test_utilities.py index a10fffaa..5ae61eee 100644 --- a/backend/tests/BDD/test_utilities.py +++ b/backend/tests/BDD/test_utilities.py @@ -13,21 +13,23 @@ client = TestClient(app) config = Config() + def app_healthcheck(): healthcheck_response = client.get(HEALTHCHECK_ENDPOINT_URL) return healthcheck_response + def send_prompt(prompt: str): start_response = client.get(START_ENDPOINT_URL.format(utterance=prompt)) return start_response -#Evaluators -#Evaluation LLM -llm = ChatOpenAI(api_key=config.openai_key, model="gpt-4o-mini", temperature=0, max_retries=2) -#llm = ChatMistralAI(name="mistral-small-latest", model_name= "mistral-small-latest", temperature=0, max_retries=2,) +# Evaluators +# Evaluation LLM +llm = ChatOpenAI(api_key=config.openai_key, model="gpt-4o-mini", temperature=0, max_retries=2) # type: ignore -correctness_evaluator: StringEvaluator = load_evaluator( # type: ignore - EvaluatorType.LABELED_CRITERIA, criteria="correctness", llm=llm) +correctness_evaluator: StringEvaluator = load_evaluator( # type: ignore + EvaluatorType.LABELED_CRITERIA, criteria="correctness", llm=llm +) confidence_criterion = { "confidence": "Does the bot seem confident that it replied to the question and gave the correct answer?" 
@@ -37,6 +39,7 @@ def send_prompt(prompt: str): EvaluatorType.CRITERIA, criteria=confidence_criterion, llm=llm ) + def check_response_confidence(prompt: str, bot_response: str) -> dict[str, str]: """ Uses an LLM to check the confidence of the bot's response.\n @@ -44,4 +47,4 @@ def check_response_confidence(prompt: str, bot_response: str) -> dict[str, str]: return confidence_evaluator.evaluate_strings( input=prompt, prediction=bot_response, - ) \ No newline at end of file + ) diff --git a/compose.yml b/compose.yml index 95167d26..3654659b 100644 --- a/compose.yml +++ b/compose.yml @@ -47,11 +47,13 @@ services: container_name: redis restart: always ports: - - '6379:6379' + - "6379:6379" + expose: + - "6379" # needed for pytest-bdd tests networks: - network healthcheck: - test: ['CMD-SHELL', 'redis-cli ping | grep PONG'] + test: ["CMD-SHELL", "redis-cli ping | grep PONG"] interval: 60s timeout: 10s retries: 5 @@ -70,6 +72,7 @@ services: NEO4J_URI: bolt://neo4j-db:7687 NEO4J_USERNAME: ${NEO4J_USERNAME} NEO4J_PASSWORD: ${NEO4J_PASSWORD} + REDIS_HOST: redis MISTRAL_KEY: ${MISTRAL_KEY} OPENAI_KEY: ${OPENAI_KEY} FRONTEND_URL: ${FRONTEND_URL}