From 602435e710990796bfa6b8f7330f2d776957c572 Mon Sep 17 00:00:00 2001
From: %magics <cadddr@users.noreply.github.com>
Date: Sat, 16 Nov 2024 20:48:30 +0000
Subject: [PATCH] clean up prompt tests format

---
 .../core/benchmarks/runbugrun/runbugrun.py      | 17 ++++++++++-------
 .../sample/strategies/instruct_python.py        |  5 ++++-
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/elleelleaime/core/benchmarks/runbugrun/runbugrun.py b/elleelleaime/core/benchmarks/runbugrun/runbugrun.py
index 33417cde..c4c8080b 100644
--- a/elleelleaime/core/benchmarks/runbugrun/runbugrun.py
+++ b/elleelleaime/core/benchmarks/runbugrun/runbugrun.py
@@ -86,12 +86,15 @@ def get_failing_tests(self, buggy_file, errors, test_rows):
         with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
             futures = []
             futures_to_tests = {}
-
+                
             for test_id, (test_input, test_output) in test_rows.iterrows():
+                test_input = test_input.strip()
+                test_output = test_output.strip()
+
                 if isinstance(errors, list):
                     result = errors[0]['exception'] + '\n' + errors[0]['output']
-                    cause = f"""Function with input {test_input.replace('"', "'")} failed with error: {result}""" 
-                    failing_tests[f"""{test_input} -> {test_output}"""] = cause
+                    cause = f"""Function with input: \n{test_input} \nexpected to output: \n{test_output} \nfailed with error: \n{result.strip()}""" 
+                    failing_tests[f"""test_{test_id}"""] = cause
                 else: # if there isn't a runtime exception, need to execute to get the cause of test failure
                     return failing_tests
                     # TODO: checkout first?
@@ -102,11 +105,11 @@ def get_failing_tests(self, buggy_file, errors, test_rows):
                 returncode, result = future.result()
                 test_input, test_output = futures_to_tests[future]
                 if returncode:
-                    cause = f"""Function with input {test_input.replace('"', "'")} failed with error: {result}""" 
-                    failing_tests[f"""{test_input} -> {test_output}"""] = cause
+                    cause = f"""Function with input: \n{test_input} \nexpected to output: \n{test_output} \nfailed with error: \n{result.strip()}""" 
+                    failing_tests[f"""test_{test_id}"""] = cause
                 elif result != test_output.strip():
-                    cause = f"""Expected function with input {test_input.replace('"', "'")} to output {test_output.replace('"', "'").replace("'", r"\'")} but got {result}"""
-                    failing_tests[f"""{test_input} -> {test_output}"""] = cause
+                    cause = f"""Function with input: \n{test_input} \nexpected to output: \n{test_output} \nbut got: \n{result}"""
+                    failing_tests[f"""test_{test_id}"""] = cause
                 else:
                     continue
 
diff --git a/elleelleaime/sample/strategies/instruct_python.py b/elleelleaime/sample/strategies/instruct_python.py
index e801ccfb..6ae7b6df 100644
--- a/elleelleaime/sample/strategies/instruct_python.py
+++ b/elleelleaime/sample/strategies/instruct_python.py
@@ -1,5 +1,6 @@
 from typing import Optional, Tuple
 from unidiff import PatchSet
+import re
 
 from elleelleaime.sample.strategy import PromptingStrategy
 from elleelleaime.core.benchmarks.bug import RichBug
@@ -38,9 +39,11 @@ def instruct(
 
         failing_tests_string = ""
         for test_case, cause in failing_test_causes.items():
+            expected = re.search('expected to output: \n(.*)\n(?:failed|but got)', cause)
+            expected = f"\"{expected.group(1)}\"" if expected else 'N/A'
             failing_tests_string += f"""Test `{test_case}`:
 ```python
-assert result == {test_case.split(' -> ')[-1]}
+assert result == {expected}
 ```
 Test `{test_case}` error:
 ```