
Fix redis usage for tests
mic-smith committed Oct 30, 2024
1 parent ceabc98 commit 4acaef3
Showing 6 changed files with 60 additions and 62 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/test-backend.yml
@@ -1,5 +1,6 @@
name: Test Backend
on:
workflow_dispatch:
pull_request:
push:
branches:
@@ -33,4 +34,5 @@ jobs:
emoji: true
verbose: true
job-summary: true
custom-arguments: '--ignore=tests/BDD'
report-title: 'Backend Test Report'
2 changes: 1 addition & 1 deletion backend/src/utils/config.py
@@ -4,7 +4,7 @@
default_frontend_url = "http://localhost:8650"
default_neo4j_uri = "bolt://localhost:7687"
default_files_directory = "files"
default_redis_host = "redis"
default_redis_host = "localhost"
default_redis_cache_duration = 3600


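The new default only matters when nothing overrides it: inside the compose stack the backend still receives REDIS_HOST=redis (see the compose.yml change further down), while pytest-bdd runs on the host fall back to localhost. A minimal sketch of how Config presumably resolves the value; the attribute name and lookup order are assumptions, not taken from this diff:

import os

default_redis_host = "localhost"  # new default, suited to test runs outside the compose network


class Config:
    def __init__(self) -> None:
        # Assumed lookup order: an explicit REDIS_HOST (set to "redis" in compose.yml) wins;
        # otherwise local pytest-bdd runs use the Redis port published on localhost.
        self.redis_host = os.getenv("REDIS_HOST", default_redis_host)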
28 changes: 6 additions & 22 deletions backend/tests/BDD/features/Correctness/Accuracy_Factual_Correctness.feature
@@ -5,29 +5,14 @@ Scenario Outline: When a user asks InferESG for information about their transact
Then the response to this '<prompt>' should match the '<expected_response>'
Examples:
|prompt |expected_response |
# |How much did I spend at Tesco? |639.84|
|Check the database and tell me the fund with the highest ESG social score |The average ESG score (Environmental) for the WhiteRock ETF fund is approximately 69.67.|
|Check the database and tell me the fund with the highest ESG social score |Highest ESG score is Dynamic Industries with a score of 91|
|Check the database and tell me the average ESG score (Environmental) for the WhiteRock ETF fund |The average ESG score (Environmental) for the WhiteRock ETF fund is approximately 69.67|
|Check the database and tell me the fund with the highest ESG social score |Dynamic Industries with a score of 91|
|Check the database and tell me the fund with the lowest Governance ESG score |Dynamic Industries, which has a score of 60|
# |Check the database and tell me the fund with the lowest ESG score |Dynamic Industries with a score of 50|
# |Check the database and tell me the largest fund |The largest fund is the Global Energy Fund, which has a size of 1,500|
# |Check the database and tell me which funds contain Shell |Funds containing Shell are European Growth Fund, Global Energy Fund, Silverman Global ETF and WhiteRock ETF|



# |How much did I spend on Amazon? |You spent a total of £1586.56 on Amazon |
# |How much did I spend on Tesco compared to Amazon? |946.72 |

@web_agent
Scenario Outline: When a user asks InferESG generic questions
Given a prompt to InferESG
@@ -50,5 +35,4 @@ Scenario Outline: Check Response's confidence
Examples:
|prompt |
|What is the capital of France? |
|How much did I spend at Tesco? |

64 changes: 35 additions & 29 deletions backend/tests/BDD/step_defs/test_prompts.py
@@ -1,31 +1,38 @@
from pytest_bdd import given, when, then, parsers, scenarios
import pytest
import logging
from tests.BDD.test_utilities import (
send_prompt,
app_healthcheck,
correctness_evaluator,
healthy_response,
check_response_confidence,
)
from decimal import Decimal
import decimal

logger = logging.getLogger(__name__)

scenarios("../features/Correctness/Accuracy_Factual_Correctness.feature")


@pytest.fixture
def context():
return {}


@given(parsers.parse("a prompt to InferESG"))
def prepare_prompt(context):
healthcheck_response = app_healthcheck()
assert healthcheck_response.status_code == 200
assert healthcheck_response.json() == healthy_response
context['health_check_passed'] = True
context["health_check_passed"] = True


@when(parsers.parse("I get the response"))
def get_response(context):
assert context.get('health_check_passed', False)
assert context.get("health_check_passed", False)


@then(parsers.parse("the response to this '{prompt}' should match the '{expected_response}'"))
def check_response_includes_expected_response(context, prompt, expected_response):
@@ -36,44 +43,43 @@ def check_response_includes_expected_response(context, prompt, expected_response
expected_value = Decimal(str(expected_response).strip())
actual_value = Decimal(str(actual_response).strip())

tolerance = Decimal('0.01')
tolerance = Decimal("0.01")
is_equal = abs(expected_value - actual_value) <= tolerance

if not is_equal:
pytest.fail(
f"\nNumeric values don't match!\n"
f"Expected: {expected_value}\n"
f"Actual: {actual_value}"
)
pytest.fail(f"\nNumeric values don't match!\n" f"Expected: {expected_value}\n" f"Actual: {actual_value}")

except (ValueError, decimal.InvalidOperation):
expected_str = str(expected_response).strip()
actual_str = str(actual_response).strip()


assert actual_str.find(expected_str) != -1, (
f"\nTest failed!\n"
f"Expected: {expected_str}\n"
f"Actual: {actual_str}"
logger.info(f"Expected : {expected_str} \nActual: {actual_str}")

if actual_str.find(expected_str) == -1:
result = correctness_evaluator.evaluate_strings(
input=prompt,
prediction=expected_str,
reference=actual_str,
)

# if expected_str != actual_str:
# result = correctness_evaluator.evaluate_strings(
# input=prompt,
# prediction=expected_str,
# reference=actual_str,
# )
if result["value"] == "N":
logger.error(
f"\nTest failed!\n"
f"Expected: {expected_str}\n"
f"Actual: {actual_str}\n"
f"Reasoning: {result.get('reasoning', 'No reasoning provided')}"
)

assert result["value"] == "Y", (
f"\nTest failed!\n"
f"Expected: {expected_str}\n"
f"Actual: {actual_str}\n"
f"Reasoning: {result.get('reasoning', 'No reasoning provided')}"
)

# assert result["score"] == 1, (
# f"\nTest failed!\n"
# f"Expected: {expected_str}\n"
# f"Actual: {actual_str}\n"
# f"Reasoning: {result.get('reasoning', 'No reasoning provided')}"
# )

@then(parsers.parse("the response to this '{prompt}' should give a confident answer"))
def check_bot_response_confidence(prompt):
response = send_prompt(prompt)
result = check_response_confidence(prompt, response.json())
assert result["score"] == 1, "The bot response is not confident enough. \nReasoning: " + result["reasoning"]

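Because the old and new lines are interleaved above, the comparison logic is easier to read in one piece. A sketch of the step as it stands after this change, assembled from the added lines; the lines above the hunk, which obtain actual_response, are not shown in the diff and are assumed here:

@then(parsers.parse("the response to this '{prompt}' should match the '{expected_response}'"))
def check_response_includes_expected_response(context, prompt, expected_response):
    actual_response = send_prompt(prompt).json()  # assumption: not visible in this hunk

    try:
        # Numeric answers: compare as Decimals with a small tolerance.
        expected_value = Decimal(str(expected_response).strip())
        actual_value = Decimal(str(actual_response).strip())
        tolerance = Decimal("0.01")
        if abs(expected_value - actual_value) > tolerance:
            pytest.fail(f"\nNumeric values don't match!\n" f"Expected: {expected_value}\n" f"Actual: {actual_value}")
    except (ValueError, decimal.InvalidOperation):
        # Text answers: accept a substring match; otherwise ask the LLM-based
        # correctness evaluator whether the two answers agree.
        expected_str = str(expected_response).strip()
        actual_str = str(actual_response).strip()
        logger.info(f"Expected : {expected_str} \nActual: {actual_str}")

        if actual_str.find(expected_str) == -1:
            result = correctness_evaluator.evaluate_strings(
                input=prompt,
                prediction=expected_str,
                reference=actual_str,
            )
            if result["value"] == "N":
                logger.error(
                    f"\nTest failed!\n"
                    f"Expected: {expected_str}\n"
                    f"Actual: {actual_str}\n"
                    f"Reasoning: {result.get('reasoning', 'No reasoning provided')}"
                )
            assert result["value"] == "Y", (
                f"\nTest failed!\n"
                f"Expected: {expected_str}\n"
                f"Actual: {actual_str}\n"
                f"Reasoning: {result.get('reasoning', 'No reasoning provided')}"
            )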
17 changes: 10 additions & 7 deletions backend/tests/BDD/test_utilities.py
@@ -13,21 +13,23 @@
client = TestClient(app)
config = Config()


def app_healthcheck():
healthcheck_response = client.get(HEALTHCHECK_ENDPOINT_URL)
return healthcheck_response


def send_prompt(prompt: str):
start_response = client.get(START_ENDPOINT_URL.format(utterance=prompt))
return start_response

#Evaluators
#Evaluation LLM
llm = ChatOpenAI(api_key=config.openai_key, model="gpt-4o-mini", temperature=0, max_retries=2)
#llm = ChatMistralAI(name="mistral-small-latest", model_name= "mistral-small-latest", temperature=0, max_retries=2,)
# Evaluators
# Evaluation LLM
llm = ChatOpenAI(api_key=config.openai_key, model="gpt-4o-mini", temperature=0, max_retries=2) # type: ignore

correctness_evaluator: StringEvaluator = load_evaluator( # type: ignore
EvaluatorType.LABELED_CRITERIA, criteria="correctness", llm=llm)
correctness_evaluator: StringEvaluator = load_evaluator( # type: ignore
EvaluatorType.LABELED_CRITERIA, criteria="correctness", llm=llm
)

confidence_criterion = {
"confidence": "Does the bot seem confident that it replied to the question and gave the correct answer?"
@@ -37,11 +39,12 @@ def send_prompt(prompt: str):
EvaluatorType.CRITERIA, criteria=confidence_criterion, llm=llm
)


def check_response_confidence(prompt: str, bot_response: str) -> dict[str, str]:
"""
Uses an LLM to check the confidence of the bot's response.\n
Returns a dictionary with the binary score (pass = 1, fail = 0) and reasoning (text format)."""
return confidence_evaluator.evaluate_strings(
input=prompt,
prediction=bot_response,
)
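For reference, both evaluators come from LangChain's criteria evaluation and return plain dictionaries; the key names below are the ones the step definitions above rely on (each real call hits the gpt-4o-mini evaluator, so an OpenAI key is needed). A hedged, standalone usage sketch with illustrative strings, not taken from a real run:

result = correctness_evaluator.evaluate_strings(
    input="Check the database and tell me the fund with the highest ESG social score",
    prediction="Dynamic Industries with a score of 91",  # expected answer from the feature table
    reference="The fund with the highest ESG social score is Dynamic Industries (91).",  # bot answer
)
print(result["value"])                                   # "Y" or "N" verdict
print(result.get("reasoning", "No reasoning provided"))  # free-text justification

confidence = check_response_confidence(
    "What is the capital of France?",
    "The capital of France is Paris.",
)
print(confidence["score"])  # binary score: pass = 1, fail = 0 (per the docstring above)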
7 changes: 5 additions & 2 deletions compose.yml
@@ -47,11 +47,13 @@ services:
container_name: redis
restart: always
ports:
- '6379:6379'
- "6379:6379"
expose:
- "6379:6379" # needed for pytest-bdd tests
networks:
- network
healthcheck:
test: ['CMD-SHELL', 'redis-cli ping | grep PONG']
test: ["CMD-SHELL", "redis-cli ping | grep PONG"]
interval: 60s
timeout: 10s
retries: 5
@@ -70,6 +72,7 @@ services:
NEO4J_URI: bolt://neo4j-db:7687
NEO4J_USERNAME: ${NEO4J_USERNAME}
NEO4J_PASSWORD: ${NEO4J_PASSWORD}
REDIS_HOST: redis
MISTRAL_KEY: ${MISTRAL_KEY}
OPENAI_KEY: ${OPENAI_KEY}
FRONTEND_URL: ${FRONTEND_URL}
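Net effect: the backend container keeps resolving the cache by service name (REDIS_HOST: redis on the compose network), while the pytest-bdd suite runs on the host and reaches the same container through the published 6379:6379 port, which is what the new localhost default in config.py points at. A quick, hypothetical host-side check (not part of this commit; assumes the redis-py client is installed) illustrates the wiring:

import redis  # assumption: redis-py is available where the BDD tests run

# Mirrors the compose healthcheck's `redis-cli ping`, but from the host side.
client = redis.Redis(host="localhost", port=6379, socket_connect_timeout=2)
assert client.ping(), "Redis not reachable on localhost:6379 - is `docker compose up redis` running?"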
