diff --git a/.github/workflows/test-backend.yml b/.github/workflows/test-backend.yml
index 84d82744..0b37731e 100644
--- a/.github/workflows/test-backend.yml
+++ b/.github/workflows/test-backend.yml
@@ -1,5 +1,6 @@
 name: Test Backend
-on:
+on:
+  workflow_dispatch:
   pull_request:
   push:
     branches:
@@ -33,4 +34,5 @@ jobs:
         emoji: true
         verbose: true
         job-summary: true
+        custom-arguments: '--ignore=backend/tests/BDD'
         report-title: 'Backend Test Report'
diff --git a/backend/requirements.txt b/backend/requirements.txt
index cd69944d..8bc08017 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -16,11 +16,14 @@ cffi==1.16.0
 cryptography==42.0.7
 isodate==0.6.1
 pycparser==2.22
-openai==1.35.3
+openai==1.52.1
 beautifulsoup4==4.12.3
 aiohttp==3.9.5
 googlesearch-python==1.2.4
 matplotlib==3.9.1
+pytest-bdd==7.3.0
+langchain==0.3.4
+langchain-openai==0.2.3
 pillow==10.4.0
 pypdf==4.3.1
 hiredis==3.0.0
diff --git a/backend/src/utils/config.py b/backend/src/utils/config.py
index cbf1cdde..5f7e4b8b 100644
--- a/backend/src/utils/config.py
+++ b/backend/src/utils/config.py
@@ -4,7 +4,7 @@
 default_frontend_url = "http://localhost:8650"
 default_neo4j_uri = "bolt://localhost:7687"
 default_files_directory = "files"
-default_redis_host = "redis"
+default_redis_host = "localhost"
 default_redis_cache_duration = 3600
diff --git a/backend/tests/BDD/__init__.py b/backend/tests/BDD/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/BDD/features/Correctness/Accuracy_Factual_Correctness.feature b/backend/tests/BDD/features/Correctness/Accuracy_Factual_Correctness.feature
new file mode 100644
index 00000000..3bbb27e0
--- /dev/null
+++ b/backend/tests/BDD/features/Correctness/Accuracy_Factual_Correctness.feature
@@ -0,0 +1,38 @@
+@database_agent @ESG
+Scenario Outline: When a user asks InferESG for information about their transaction history
+    Given a prompt to InferESG
+    When I get the response
+    Then the response to this '<prompt>' should match the '<expected_response>'
+Examples:
+|prompt |expected_response |
+|Check the database and tell me the average ESG score (Environmental) for the WhiteRock ETF fund |The average ESG score (Environmental) for the WhiteRock ETF fund is approximately 69.67|
+|Check the database and tell me the fund with the highest ESG social score |Dynamic Industries with a score of 91|
+|Check the database and tell me the fund with the lowest Governance ESG score |Dynamic Industries, which has a score of 60|
+# |Check the database and tell me the fund with the lowest ESG score |Dynamic Industries with a score of 50|
+# |Check the database and tell me the largest fund |The largest fund is the Global Energy Fund, which has a size of 1,500|
+# |Check the database and tell me which funds contain Shell |Funds containing Shell are European Growth Fund, Global Energy Fund, Silverman Global ETF and WhiteRock ETF|
+
+
+@web_agent
+Scenario Outline: When a user asks InferESG generic questions
+    Given a prompt to InferESG
+    When I get the response
+    Then the response to this '<prompt>' should match the '<expected_response>'
+Examples:
+|prompt |expected_response |
+|What is the capital of France? |Paris |
+|What is the capital of Zimbabwe? |Harare |
+|What is the capital of Spain? |Madrid |
+|What is the capital of China? |Beijing |
+|What is the capital of United Kingdom? |London |
+|What is the capital of Sweden? |Stockholm |
+
+@confidence
+Scenario Outline: Check Response's confidence
+    Given a prompt to InferESG
+    When I get the response
+    Then the response to this '<prompt>' should give a confident answer
+Examples:
+|prompt |
+|What is the capital of France? |
+
diff --git a/backend/tests/BDD/step_defs/__init__.py b/backend/tests/BDD/step_defs/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/tests/BDD/step_defs/test_prompts.py b/backend/tests/BDD/step_defs/test_prompts.py
new file mode 100644
index 00000000..732356d2
--- /dev/null
+++ b/backend/tests/BDD/step_defs/test_prompts.py
@@ -0,0 +1,85 @@
+from pytest_bdd import given, when, then, parsers, scenarios
+import pytest
+import logging
+from tests.BDD.test_utilities import (
+    send_prompt,
+    app_healthcheck,
+    correctness_evaluator,
+    healthy_response,
+    check_response_confidence,
+)
+from decimal import Decimal
+import decimal
+
+logger = logging.getLogger(__name__)
+
+scenarios("../features/Correctness/Accuracy_Factual_Correctness.feature")
+
+
+@pytest.fixture
+def context():
+    return {}
+
+
+@given(parsers.parse("a prompt to InferESG"))
+def prepare_prompt(context):
+    healthcheck_response = app_healthcheck()
+    assert healthcheck_response.status_code == 200
+    assert healthcheck_response.json() == healthy_response
+    context["health_check_passed"] = True
+
+
+@when(parsers.parse("I get the response"))
+def get_response(context):
+    assert context.get("health_check_passed", False)
+
+
+@then(parsers.parse("the response to this '{prompt}' should match the '{expected_response}'"))
+def check_response_includes_expected_response(context, prompt, expected_response):
+    response = send_prompt(prompt)
+    actual_response = response.json()
+
+    try:
+        expected_value = Decimal(str(expected_response).strip())
+        actual_value = Decimal(str(actual_response).strip())
+
+        tolerance = Decimal("0.01")
+        is_equal = abs(expected_value - actual_value) <= tolerance
+
+        if not is_equal:
+            pytest.fail(f"\nNumeric values don't match!\n" f"Expected: {expected_value}\n" f"Actual: {actual_value}")
+
+    except (ValueError, decimal.InvalidOperation):
+        expected_str = str(expected_response).strip()
+        actual_str = str(actual_response).strip()
+
+        logger.info(f"Expected : {expected_str} \nActual: {actual_str}")
+
+        if actual_str.find(expected_str) == -1:
+            result = correctness_evaluator.evaluate_strings(
+                input=prompt,
+                prediction=expected_str,
+                reference=actual_str,
+            )
+
+            if result["value"] == "N":
+                logger.error(
+                    f"\nTest failed!\n"
+                    f"Expected: {expected_str}\n"
+                    f"Actual: {actual_str}\n"
+                    f"Reasoning: {result.get('reasoning', 'No reasoning provided')}"
+                )
+
+            assert result["value"] == "Y", (
+                f"\nTest failed!\n"
+                f"Expected: {expected_str}\n"
+                f"Actual: {actual_str}\n"
+                f"Reasoning: {result.get('reasoning', 'No reasoning provided')}"
+            )
+
+
+@then(parsers.parse("the response to this '{prompt}' should give a confident answer"))
+def check_bot_response_confidence(prompt):
+    response = send_prompt(prompt)
+    result = check_response_confidence(prompt, response.json())
+    assert result["score"] == 1, "The bot response is not confident enough. \nReasoning: " + result["reasoning"]
diff --git a/backend/tests/BDD/test_utilities.py b/backend/tests/BDD/test_utilities.py
new file mode 100644
index 00000000..5ae61eee
--- /dev/null
+++ b/backend/tests/BDD/test_utilities.py
@@ -0,0 +1,50 @@
+from src.api import app
+from src.utils import Config
+from fastapi.testclient import TestClient
+from langchain.evaluation import EvaluatorType, StringEvaluator, load_evaluator
+from langchain_openai.chat_models import ChatOpenAI
+
+START_ENDPOINT_URL = "/chat?utterance={utterance}"
+CONVERSATION_ENDPOINT_URL = "/chat?utterance={utterance}"
+HEALTHCHECK_ENDPOINT_URL = "/health"
+health_prefix = "InferESG healthcheck: "
+healthy_response = health_prefix + "backend is healthy. Neo4J is healthy."
+
+client = TestClient(app)
+config = Config()
+
+
+def app_healthcheck():
+    healthcheck_response = client.get(HEALTHCHECK_ENDPOINT_URL)
+    return healthcheck_response
+
+
+def send_prompt(prompt: str):
+    start_response = client.get(START_ENDPOINT_URL.format(utterance=prompt))
+    return start_response
+
+# Evaluators
+# Evaluation LLM
+llm = ChatOpenAI(api_key=config.openai_key, model="gpt-4o-mini", temperature=0, max_retries=2)  # type: ignore
+
+correctness_evaluator: StringEvaluator = load_evaluator(  # type: ignore
+    EvaluatorType.LABELED_CRITERIA, criteria="correctness", llm=llm
+)
+
+confidence_criterion = {
+    "confidence": "Does the bot seem confident that it replied to the question and gave the correct answer?"
+}
+
+confidence_evaluator: StringEvaluator = load_evaluator(  # type: ignore
+    EvaluatorType.CRITERIA, criteria=confidence_criterion, llm=llm
+)
+
+
+def check_response_confidence(prompt: str, bot_response: str) -> dict[str, str]:
+    """
+    Uses an LLM to check the confidence of the bot's response.\n
+    Returns a dictionary with the binary score (pass = 1, fail = 0) and reasoning (text format)."""
+    return confidence_evaluator.evaluate_strings(
+        input=prompt,
+        prediction=bot_response,
+    )
diff --git a/compose.yml b/compose.yml
index 95167d26..3654659b 100644
--- a/compose.yml
+++ b/compose.yml
@@ -47,11 +47,13 @@ services:
     container_name: redis
     restart: always
     ports:
-      - '6379:6379'
+      - "6379:6379"
+    expose:
+      - "6379:6379" # needed for pytest-bdd tests
     networks:
       - network
     healthcheck:
-      test: ['CMD-SHELL', 'redis-cli ping | grep PONG']
+      test: ["CMD-SHELL", "redis-cli ping | grep PONG"]
       interval: 60s
       timeout: 10s
       retries: 5
@@ -70,6 +72,7 @@ services:
       NEO4J_URI: bolt://neo4j-db:7687
       NEO4J_USERNAME: ${NEO4J_USERNAME}
       NEO4J_PASSWORD: ${NEO4J_PASSWORD}
+      REDIS_HOST: redis
       MISTRAL_KEY: ${MISTRAL_KEY}
       OPENAI_KEY: ${OPENAI_KEY}
       FRONTEND_URL: ${FRONTEND_URL}