
Fix redis usage for tests
mic-smith committed Oct 30, 2024
1 parent ceabc98 commit 4acaef3
Showing 6 changed files with 60 additions and 62 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/test-backend.yml
@@ -1,5 +1,6 @@
name: Test Backend
on:
workflow_dispatch:
pull_request:
push:
branches:
@@ -33,4 +34,5 @@ jobs:
emoji: true
verbose: true
job-summary: true
custom-arguments: '--ignore=tests/BDD'
report-title: 'Backend Test Report'
2 changes: 1 addition & 1 deletion backend/src/utils/config.py
@@ -4,7 +4,7 @@
default_frontend_url = "http://localhost:8650"
default_neo4j_uri = "bolt://localhost:7687"
default_files_directory = "files"
default_redis_host = "redis"
default_redis_host = "localhost"
default_redis_cache_duration = 3600


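The new default only matters when nothing overrides it: inside the compose stack the backend still receives REDIS_HOST=redis (see the compose.yml change further down), while pytest-bdd runs on the host fall back to localhost. A minimal sketch of how Config presumably resolves the value; the attribute name and lookup order are assumptions, not taken from this diff:

import os

default_redis_host = "localhost"  # new default, suited to test runs outside the compose network


class Config:
    def __init__(self) -> None:
        # Assumed lookup order: an explicit REDIS_HOST (set to "redis" in compose.yml) wins;
        # otherwise local pytest-bdd runs use the Redis port published on localhost.
        self.redis_host = os.getenv("REDIS_HOST", default_redis_host)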
28 changes: 6 additions & 22 deletions backend/tests/BDD/features/Correctness/Accuracy_Factual_Correctness.feature
@@ -5,29 +5,14 @@ Scenario Outline: When a user asks InferESG for information about their transact
Then the response to this '<prompt>' should match the '<expected_response>'
Examples:
|prompt |expected_response |
# |How much did I spend at Tesco? |639.84|
|Check the database and tell me the fund with the highest ESG social score |The average ESG score (Environmental) for the WhiteRock ETF fund is approximately 69.67.|
|Check the database and tell me the fund with the highest ESG social score |Highest ESG score is Dynamic Industries with a score of 91|
|Check the database and tell me the average ESG score (Environmental) for the WhiteRock ETF fund |The average ESG score (Environmental) for the WhiteRock ETF fund is approximately 69.67|
|Check the database and tell me the fund with the highest ESG social score |Dynamic Industries with a score of 91|
|Check the database and tell me the fund with the lowest Governance ESG score |Dynamic Industries, which has a score of 60|
# |Check the database and tell me the fund with the lowest ESG score |Dynamic Industries with a score of 50|
# |Check the database and tell me the largest fund |The largest fund is the Global Energy Fund, which has a size of 1,500|
# |Check the database and tell me which funds contain Shell |Funds containing Shell are European Growth Fund, Global Energy Fund, Silverman Global ETF and WhiteRock ETF|



# |How much did I spend on Amazon? |You spent a total of £1586.56 on Amazon |
# |How much did I spend on Tesco compared to Amazon? |946.72 |

@web_agent
Scenario Outline: When a user asks InferESG generic questions
Given a prompt to InferESG
@@ -50,5 +35,4 @@ Scenario Outline: Check Response's confidence
Examples:
|prompt |
|What is the capital of France? |
|How much did I spend at Tesco? |

64 changes: 35 additions & 29 deletions backend/tests/BDD/step_defs/test_prompts.py
@@ -1,31 +1,38 @@
from pytest_bdd import given, when, then, parsers, scenarios
import pytest
import logging
from tests.BDD.test_utilities import (
send_prompt,
app_healthcheck,
correctness_evaluator,
healthy_response,
check_response_confidence,
)
from decimal import Decimal
import decimal

logger = logging.getLogger(__name__)

scenarios("../features/Correctness/Accuracy_Factual_Correctness.feature")


@pytest.fixture
def context():
return {}


@given(parsers.parse("a prompt to InferESG"))
def prepare_prompt(context):
healthcheck_response = app_healthcheck()
assert healthcheck_response.status_code == 200
assert healthcheck_response.json() == healthy_response
context['health_check_passed'] = True
context["health_check_passed"] = True


@when(parsers.parse("I get the response"))
def get_response(context):
assert context.get('health_check_passed', False)
assert context.get("health_check_passed", False)


@then(parsers.parse("the response to this '{prompt}' should match the '{expected_response}'"))
def check_response_includes_expected_response(context, prompt, expected_response):
@@ -36,44 +43,43 @@ def check_response_includes_expected_response(context, prompt, expected_response
expected_value = Decimal(str(expected_response).strip())
actual_value = Decimal(str(actual_response).strip())

tolerance = Decimal('0.01')
tolerance = Decimal("0.01")
is_equal = abs(expected_value - actual_value) <= tolerance

if not is_equal:
pytest.fail(
f"\nNumeric values don't match!\n"
f"Expected: {expected_value}\n"
f"Actual: {actual_value}"
)
pytest.fail(f"\nNumeric values don't match!\n" f"Expected: {expected_value}\n" f"Actual: {actual_value}")

except (ValueError, decimal.InvalidOperation):
expected_str = str(expected_response).strip()
actual_str = str(actual_response).strip()


assert actual_str.find(expected_str) != -1, (
f"\nTest failed!\n"
f"Expected: {expected_str}\n"
f"Actual: {actual_str}"
logger.info(f"Expected : {expected_str} \nActual: {actual_str}")

if actual_str.find(expected_str) == -1:
result = correctness_evaluator.evaluate_strings(
input=prompt,
prediction=expected_str,
reference=actual_str,
)

# if expected_str != actual_str:
# result = correctness_evaluator.evaluate_strings(
# input=prompt,
# prediction=expected_str,
# reference=actual_str,
# )
if result["value"] == "N":
logger.error(
f"\nTest failed!\n"
f"Expected: {expected_str}\n"
f"Actual: {actual_str}\n"
f"Reasoning: {result.get('reasoning', 'No reasoning provided')}"
)

assert result["value"] == "Y", (
f"\nTest failed!\n"
f"Expected: {expected_str}\n"
f"Actual: {actual_str}\n"
f"Reasoning: {result.get('reasoning', 'No reasoning provided')}"
)

# assert result["score"] == 1, (
# f"\nTest failed!\n"
# f"Expected: {expected_str}\n"
# f"Actual: {actual_str}\n"
# f"Reasoning: {result.get('reasoning', 'No reasoning provided')}"
# )

@then(parsers.parse("the response to this '{prompt}' should give a confident answer"))
def check_bot_response_confidence(prompt):
response = send_prompt(prompt)
result = check_response_confidence(prompt, response.json())
assert result["score"] == 1, "The bot response is not confident enough. \nReasoning: " + result["reasoning"]

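Because the old and new lines are interleaved above, the comparison logic is easier to read in one piece. A sketch of the step as it stands after this change, assembled from the added lines; the lines above the hunk, which obtain actual_response, are not shown in the diff and are assumed here:

@then(parsers.parse("the response to this '{prompt}' should match the '{expected_response}'"))
def check_response_includes_expected_response(context, prompt, expected_response):
    actual_response = send_prompt(prompt).json()  # assumption: not visible in this hunk

    try:
        # Numeric answers: compare as Decimals with a small tolerance.
        expected_value = Decimal(str(expected_response).strip())
        actual_value = Decimal(str(actual_response).strip())
        tolerance = Decimal("0.01")
        if abs(expected_value - actual_value) > tolerance:
            pytest.fail(f"\nNumeric values don't match!\n" f"Expected: {expected_value}\n" f"Actual: {actual_value}")
    except (ValueError, decimal.InvalidOperation):
        # Text answers: accept a substring match; otherwise ask the LLM-based
        # correctness evaluator whether the two answers agree.
        expected_str = str(expected_response).strip()
        actual_str = str(actual_response).strip()
        logger.info(f"Expected : {expected_str} \nActual: {actual_str}")

        if actual_str.find(expected_str) == -1:
            result = correctness_evaluator.evaluate_strings(
                input=prompt,
                prediction=expected_str,
                reference=actual_str,
            )
            if result["value"] == "N":
                logger.error(
                    f"\nTest failed!\n"
                    f"Expected: {expected_str}\n"
                    f"Actual: {actual_str}\n"
                    f"Reasoning: {result.get('reasoning', 'No reasoning provided')}"
                )
            assert result["value"] == "Y", (
                f"\nTest failed!\n"
                f"Expected: {expected_str}\n"
                f"Actual: {actual_str}\n"
                f"Reasoning: {result.get('reasoning', 'No reasoning provided')}"
            )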
17 changes: 10 additions & 7 deletions backend/tests/BDD/test_utilities.py
@@ -13,21 +13,23 @@
client = TestClient(app)
config = Config()


def app_healthcheck():
healthcheck_response = client.get(HEALTHCHECK_ENDPOINT_URL)
return healthcheck_response


def send_prompt(prompt: str):
start_response = client.get(START_ENDPOINT_URL.format(utterance=prompt))
return start_response

#Evaluators
#Evaluation LLM
llm = ChatOpenAI(api_key=config.openai_key, model="gpt-4o-mini", temperature=0, max_retries=2)
#llm = ChatMistralAI(name="mistral-small-latest", model_name= "mistral-small-latest", temperature=0, max_retries=2,)
# Evaluators
# Evaluation LLM
llm = ChatOpenAI(api_key=config.openai_key, model="gpt-4o-mini", temperature=0, max_retries=2) # type: ignore

correctness_evaluator: StringEvaluator = load_evaluator( # type: ignore
EvaluatorType.LABELED_CRITERIA, criteria="correctness", llm=llm)
correctness_evaluator: StringEvaluator = load_evaluator( # type: ignore
EvaluatorType.LABELED_CRITERIA, criteria="correctness", llm=llm
)

confidence_criterion = {
"confidence": "Does the bot seem confident that it replied to the question and gave the correct answer?"
@@ -37,11 +39,12 @@ def send_prompt(prompt: str):
EvaluatorType.CRITERIA, criteria=confidence_criterion, llm=llm
)


def check_response_confidence(prompt: str, bot_response: str) -> dict[str, str]:
"""
Uses an LLM to check the confidence of the bot's response.\n
Returns a dictionary with the binary score (pass = 1, fail = 0) and reasoning (text format)."""
return confidence_evaluator.evaluate_strings(
input=prompt,
prediction=bot_response,
)
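For reference, both evaluators come from LangChain's criteria evaluation and return plain dictionaries; the key names below are the ones the step definitions above rely on (each real call hits the gpt-4o-mini evaluator, so an OpenAI key is needed). A hedged, standalone usage sketch with illustrative strings, not taken from a real run:

result = correctness_evaluator.evaluate_strings(
    input="Check the database and tell me the fund with the highest ESG social score",
    prediction="Dynamic Industries with a score of 91",  # expected answer from the feature table
    reference="The fund with the highest ESG social score is Dynamic Industries (91).",  # bot answer
)
print(result["value"])                                   # "Y" or "N" verdict
print(result.get("reasoning", "No reasoning provided"))  # free-text justification

confidence = check_response_confidence(
    "What is the capital of France?",
    "The capital of France is Paris.",
)
print(confidence["score"])  # binary score: pass = 1, fail = 0 (per the docstring above)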
7 changes: 5 additions & 2 deletions compose.yml
@@ -47,11 +47,13 @@ services:
container_name: redis
restart: always
ports:
- '6379:6379'
- "6379:6379"
expose:
- "6379:6379" # needed for pytest-bdd tests
networks:
- network
healthcheck:
test: ['CMD-SHELL', 'redis-cli ping | grep PONG']
test: ["CMD-SHELL", "redis-cli ping | grep PONG"]
interval: 60s
timeout: 10s
retries: 5
@@ -70,6 +72,7 @@ services:
NEO4J_URI: bolt://neo4j-db:7687
NEO4J_USERNAME: ${NEO4J_USERNAME}
NEO4J_PASSWORD: ${NEO4J_PASSWORD}
REDIS_HOST: redis
MISTRAL_KEY: ${MISTRAL_KEY}
OPENAI_KEY: ${OPENAI_KEY}
FRONTEND_URL: ${FRONTEND_URL}
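Net effect: the backend container keeps resolving the cache by service name (REDIS_HOST: redis on the compose network), while the pytest-bdd suite runs on the host and reaches the same container through the published 6379:6379 port, which is what the new localhost default in config.py points at. A quick, hypothetical host-side check (not part of this commit; assumes the redis-py client is installed) illustrates the wiring:

import redis  # assumption: redis-py is available where the BDD tests run

# Mirrors the compose healthcheck's `redis-cli ping`, but from the host side.
client = redis.Redis(host="localhost", port=6379, socket_connect_timeout=2)
assert client.ping(), "Redis not reachable on localhost:6379 - is `docker compose up redis` running?"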
