Clean up the format of failing tests in prompts

ASSERT-KTH · Nov 16, 2024 · 602435e
1 parent 57b2c05
commit 602435e
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 8 deletions.
diff --git a/elleelleaime/core/benchmarks/runbugrun/runbugrun.py b/elleelleaime/core/benchmarks/runbugrun/runbugrun.py
@@ -86,12 +86,15 @@ def get_failing_tests(self, buggy_file, errors, test_rows):
         with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
             futures = []
             futures_to_tests = {}
-
+                
             for test_id, (test_input, test_output) in test_rows.iterrows():
+                test_input = test_input.strip()
+                test_output = test_output.strip()
+
                 if isinstance(errors, list):
                     result = errors[0]['exception'] + '\n' + errors[0]['output']
-                    cause = f"""Function with input {test_input.replace('"', "'")} failed with error: {result}""" 
-                    failing_tests[f"""{test_input} -> {test_output}"""] = cause
+                    cause = f"""Function with input: \n{test_input} \nexpected to output: \n{test_output} \nfailed with error: \n{result.strip()}""" 
+                    failing_tests[f"""test_{test_id}"""] = cause
                 else: # if there isn't a runtime exception, need to execute to get the cause of test failure
                     return failing_tests
                     # TODO: checkout first?
@@ -102,11 +105,11 @@ def get_failing_tests(self, buggy_file, errors, test_rows):
                 returncode, result = future.result()
                 test_input, test_output = futures_to_tests[future]
                 if returncode:
-                    cause = f"""Function with input {test_input.replace('"', "'")} failed with error: {result}""" 
-                    failing_tests[f"""{test_input} -> {test_output}"""] = cause
+                    cause = f"""Function with input: \n{test_input} \nexpected to output: \n{test_output} \nfailed with error: \n{result.strip()}""" 
+                    failing_tests[f"""test_{test_id}"""] = cause
                 elif result != test_output.strip():
-                    cause = f"""Expected function with input {test_input.replace('"', "'")} to output {test_output.replace('"', "'").replace("'", r"\'")} but got {result}"""
-                    failing_tests[f"""{test_input} -> {test_output}"""] = cause
+                    cause = f"""Function with input: \n{test_input} \nexpected to output: \n{test_output} \nbut got: \n{result}"""
+                    failing_tests[f"""test_{test_id}"""] = cause
                 else:
                     continue
 

diff --git a/elleelleaime/sample/strategies/instruct_python.py b/elleelleaime/sample/strategies/instruct_python.py
@@ -1,5 +1,6 @@
 from typing import Optional, Tuple
 from unidiff import PatchSet
+import re
 
 from elleelleaime.sample.strategy import PromptingStrategy
 from elleelleaime.core.benchmarks.bug import RichBug
@@ -38,9 +39,11 @@ def instruct(
 
         failing_tests_string = ""
         for test_case, cause in failing_test_causes.items():
+            expected = re.search('expected to output: \n(.*)\n(?:failed|but got)', cause)
+            expected = f"\"{expected.group(1)}\"" if expected else 'N/A'
             failing_tests_string += f"""Test `{test_case}`:
 ```python
-assert result == {test_case.split(' -> ')[-1]}
+assert result == {expected}
 ```
 Test `{test_case}` error:
 ```