Fs 80/use https for webagent search #21
Changes from all commits: 236aa5e, 08d6c37, e91b72f, d0981c1, d0c532a, 1bfb87a, af12225, d402397, dc319ac, ab09932, f1c9382, 319290c, c4016df, b41e90a, e0d306c
@@ -39,44 +39,48 @@ def check_response_includes_expected_response(context, prompt, expected_response
     response = send_prompt(prompt)
     actual_response = response.json()
 
-    try:
-        expected_value = Decimal(str(expected_response).strip())
-        actual_value = Decimal(str(actual_response).strip())
-
-        tolerance = Decimal("0.01")
-        is_equal = abs(expected_value - actual_value) <= tolerance
-
-        if not is_equal:
-            pytest.fail(f"\nNumeric values don't match!\n" f"Expected: {expected_value}\n" f"Actual: {actual_value}")
-
-    except (ValueError, decimal.InvalidOperation):
-        expected_str = str(expected_response).strip()
-        actual_str = str(actual_response).strip()
-
-        logger.info(f"Expected : {expected_str} \nActual: {actual_str}")
-
-        if actual_str.find(expected_str) == -1:
-            result = correctness_evaluator.evaluate_strings(
-                input=prompt,
-                prediction=expected_str,
-                reference=actual_str,
-            )
-
-            if result["value"] == "N":
-                logger.error(
-                    f"\nTest failed!\n"
-                    f"Expected: {expected_str}\n"
-                    f"Actual: {actual_str}\n"
-                    f"Reasoning: {result.get('reasoning', 'No reasoning provided')}"
-                )
-
-            assert result["value"] == "Y", (
-                f"\nTest failed!\n"
-                f"Expected: {expected_str}\n"
-                f"Actual: {actual_str}\n"
-                f"Reasoning: {result.get('reasoning', 'No reasoning provided')}"
-            )
+    # Allow `expected_response` to be a list of possible valid responses
+    possible_responses = [resp.strip() for resp in expected_response.split(",")]
Review thread on the `possible_responses` line (see the sketch after this diff):

mic-smith: It's good that this now supports multiple possible answers. This would pass for any answer that contains one of those fund names or which contains

Author: @mic-smith - I get your point, so what do you suggest we should do?

mic-smith: I guess we could repeat the whole string in each option, e.g. We're also commenting out these datasource-based tests in #23. So, I wonder if it's worth pulling these test changes into a separate branch so it doesn't block this PR, and then addressing this once we have the new flow to upload data and a different data set available?
+
+    match_found = False
+    for expected_resp in possible_responses:
+        try:
+            expected_value = Decimal(expected_resp)
+            actual_value = Decimal(str(actual_response).strip())
+
+            tolerance = Decimal("0.01")
+            if abs(expected_value - actual_value) <= tolerance:
+                match_found = True
+                break  # Exit loop if a match is found
+
+        except (ValueError, decimal.InvalidOperation):
+            if expected_resp in str(actual_response).strip():
+                match_found = True
+                break
+
+    if not match_found:
+        # Fallback to the correctness evaluator if none of the options matched
+        result = correctness_evaluator.evaluate_strings(
+            input=prompt,
+            prediction=expected_response,
+            reference=actual_response,
+        )
+
+        if result["value"] == "N":
+            logger.error(
+                f"\nTest failed!\n"
+                f"Expected one of: {possible_responses}\n"
+                f"Actual: {actual_response}\n"
+                f"Reasoning: {result.get('reasoning', 'No reasoning provided')}"
+            )
+
+        assert result["value"] == "Y", (
+            f"\nTest failed!\n"
+            f"Expected one of: {possible_responses}\n"
+            f"Actual: {actual_response}\n"
+            f"Reasoning: {result.get('reasoning', 'No reasoning provided')}"
+        )
 
 
 @then(parsers.parse("the response to this '{prompt}' should give a confident answer"))
 def check_bot_response_confidence(prompt):
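To make the review thread's concern concrete, here is a small, self-contained Python sketch of the substring-matching loophole; the fund names and response text are invented for illustration:

# Hypothetical illustration of the loophole: splitting the expected answer
# into comma-separated options and checking each option as a substring
# accepts any response that merely mentions one of the names, even a
# response that rejects both.
expected_response = "Fund Alpha, Fund Beta"
possible_responses = [resp.strip() for resp in expected_response.split(",")]

actual_response = "Neither Fund Alpha nor Fund Beta is held in this portfolio."

match_found = any(resp in actual_response for resp in possible_responses)
print(match_found)  # True, although the response contradicts both options

Repeating the whole expected sentence in each option, as suggested in the thread (e.g. "The best fund is Fund Alpha, The best fund is Fund Beta", again with invented values), narrows what counts as a match, at the cost of more verbose feature files.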
@@ -1,10 +1,11 @@
 import pytest
 from unittest.mock import patch, AsyncMock
 import json
-from src.agents.web_agent import web_general_search_core
+from src.agents.web_agent import web_general_search_core, perform_scrape
+from src.utils.web_utils import search_urls
 
 @pytest.mark.asyncio
-@patch("src.agents.web_agent.answer_user_ques", new_callable=AsyncMock)
+@patch("src.agents.web_agent.answer_user_question", new_callable=AsyncMock)
 @patch("src.agents.web_agent.create_search_term", new_callable=AsyncMock)
 @patch("src.agents.web_agent.perform_search", new_callable=AsyncMock)
 @patch("src.agents.web_agent.perform_scrape", new_callable=AsyncMock)
@@ -16,13 +17,13 @@ async def test_web_general_search_core(
     mock_perform_scrape,
     mock_perform_search,
     mock_create_search_term,
-    mock_answer_user_ques
+    mock_answer_user_question
 ):
     llm = AsyncMock()
     model = "mock_model"
 
-    # Mocking answer_user_ques to return a valid answer
-    mock_answer_user_ques.return_value = json.dumps({
+    # Mocking answer_user_question to return a valid answer
+    mock_answer_user_question.return_value = json.dumps({
         "status": "success",
         "response": json.dumps({"is_valid": True, "answer": "Example summary."})
     })
@@ -70,7 +71,7 @@ async def test_web_general_search_core_invalid_summary(
 ):
     llm = AsyncMock()
     model = "mock_model"
-    mock_perform_search.return_value = {"status": "success", "urls": ["http://example.com"]}
+    mock_perform_search.return_value = {"status": "success", "urls": ["https://example.com"], "error": None}
     mock_perform_scrape.return_value = "Example scraped content."
     mock_perform_summarization.return_value = json.dumps({"summary": "Example invalid summary."})
     mock_is_valid_answer.return_value = False
@@ -81,3 +82,35 @@ async def test_web_general_search_core_invalid_summary(
     }
     assert json.loads(result) == expected_response
 
+
+@pytest.mark.asyncio
+@patch("src.utils.web_utils.search")
+async def test_https_urls(mock_search):
+    mock_search.return_value = [
+        "https://example.com",
+        "http://nonsecure.com",
+        "https://another-secure-site.com"
+    ]
+
+    result = await search_urls("query", num_results=5)
+    expected_result = {
+        "status": "success",
+        "urls": ["https://example.com", "https://another-secure-site.com"],
+        "error": None
+    }
+    assert json.loads(result) == expected_result
+
+
+@pytest.mark.asyncio
+@patch("src.agents.web_agent.scrape_content", new_callable=AsyncMock)
+async def test_perform_scrape_http_url(mock_scrape_content):
+    mock_scrape_content.return_value = json.dumps({"status": "success", "content": "Scraped content."})
+
+    result = await perform_scrape("http://nonsecure.com")
+    assert result == ""
+
+
+@pytest.mark.asyncio
+@patch("src.agents.web_agent.scrape_content", new_callable=AsyncMock)
+async def test_perform_scrape_https_url(mock_scrape_content):
+    mock_scrape_content.return_value = json.dumps({"status": "success", "content": "Scraped content."})
+
+    result = await perform_scrape("https://secure.com")
+    assert result == "Scraped content."

Review comment on test_https_urls: 👍
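The new tests pin the HTTPS-only behavior down closely enough to sketch what the production code presumably does. The following is a minimal sketch, not the actual bodies of search_urls and perform_scrape; `search` and `scrape_content` are the helpers the tests patch and are assumed to be in scope, and everything else is illustrative:

import json

async def search_urls(query, num_results=5):
    # Sketch: run the search, then keep only HTTPS results, matching the
    # JSON shape that test_https_urls asserts on.
    urls = search(query, num_results=num_results)  # patched in the tests
    https_urls = [url for url in urls if url.startswith("https://")]
    return json.dumps({"status": "success", "urls": https_urls, "error": None})

async def perform_scrape(url):
    # Sketch: refuse non-HTTPS URLs outright; test_perform_scrape_http_url
    # expects an empty string back for an http:// input.
    if not url.startswith("https://"):
        return ""
    scrape_result = json.loads(await scrape_content(url))  # patched AsyncMock
    if scrape_result.get("status") != "success":
        return ""
    return scrape_result["content"]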
This file was deleted.

Review comment: Is it possible to test this by mocking out the `search` library?