From 972cb8b497836a9c699b1cfd21f3d895998ffcf7 Mon Sep 17 00:00:00 2001 From: SeisSerenata Date: Tue, 19 Nov 2024 06:45:09 +0000 Subject: [PATCH] chore: update output schema for parse and extract_tables --- any_parser/any_parser.py | 9 +++------ tests/test.py | 30 ++++++++++++++++++++---------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py index d4e4116..a97f8f9 100644 --- a/any_parser/any_parser.py +++ b/any_parser/any_parser.py @@ -153,9 +153,7 @@ def parse( try: response_data = response.json() - result = "\n".join( - response_data["markdown"] - ) # Using direct extraction instead of extract_key + result = response_data["markdown"] return result, f"Time Elapsed: {info}" except json.JSONDecodeError: return f"Error: Invalid JSON response: {response.text}", "" @@ -213,7 +211,7 @@ def extract_tables( try: response_data = response.json() - result = "\n".join(response_data["markdown"]) + result = response_data["markdown"] return result, f"Time Elapsed: {info}" except json.JSONDecodeError: return f"Error: Invalid JSON response: {response.text}", "" @@ -438,8 +436,7 @@ def async_fetch( elif "pii_extraction" in result: return result["pii_extraction"] elif "markdown" in result: - markdown_list = result["markdown"] - return "\n".join(markdown_list) + return result["markdown"] return f"Error: Invalid response format\n {result}" if response.status_code == 202: return "" diff --git a/tests/test.py b/tests/test.py index f1992bc..fa28b79 100755 --- a/tests/test.py +++ b/tests/test.py @@ -52,7 +52,8 @@ def test_pdf_sync_parse(self): correct_output_file = "./tests/outputs/correct_pdf_output.txt" # extract - markdown, elapsed_time = self.ap.parse(file_path=working_file) + markdown_list, elapsed_time = self.ap.parse(file_path=working_file) + markdown = "\n".join(markdown_list) self.assertFalse(markdown.startswith("Error:"), markdown) correct_output = get_ground_truth(correct_output_file) @@ -73,9 +74,10 @@ def test_pdf_sync_parse_with_file_content(self): file_type = Path(working_file).suffix.lower().lstrip(".") # extract - markdown, elapsed_time = self.ap.parse( + markdown_list, elapsed_time = self.ap.parse( file_content=file_content, file_type=file_type ) + markdown = "\n".join(markdown_list) self.assertFalse(markdown.startswith("Error:"), markdown) correct_output = get_ground_truth(correct_output_file) @@ -95,7 +97,8 @@ def test_pdf_async_parse_and_fetch(self): file_id = self.ap.async_parse(file_path=working_file) self.assertFalse(file_id.startswith("Error:"), file_id) # fetch - markdown = self.ap.async_fetch(file_id=file_id) + markdown_list = self.ap.async_fetch(file_id=file_id) + markdown = "\n".join(markdown_list) self.assertFalse(markdown.startswith("Error:"), markdown) correct_output = get_ground_truth(correct_output_file) percentage = compare_markdown(markdown, correct_output) @@ -117,7 +120,8 @@ def test_pdf_async_parse_and_fetch_with_file_content(self): file_id = self.ap.async_parse(file_content=file_content, file_type=file_type) self.assertFalse(file_id.startswith("Error:"), file_id) # fetch - markdown = self.ap.async_fetch(file_id=file_id) + markdown_list = self.ap.async_fetch(file_id=file_id) + markdown = "\n".join(markdown_list) self.assertFalse(markdown.startswith("Error:"), markdown) correct_output = get_ground_truth(correct_output_file) percentage = compare_markdown(markdown, correct_output) @@ -132,7 +136,8 @@ def test_docx_sync_extract(self): correct_output_file = "./tests/outputs/correct_docx_output.txt" # extract - markdown, elapsed_time = self.ap.parse(file_path=working_file) + markdown_list, elapsed_time = self.ap.parse(file_path=working_file) + markdown = "\n".join(markdown_list) self.assertFalse(markdown.startswith("Error:"), markdown) correct_output = get_ground_truth(correct_output_file) percentage = compare_markdown(markdown, correct_output) @@ -151,7 +156,8 @@ def test_docx_async_parse_and_fetch(self): file_id = self.ap.async_parse(file_path=working_file) self.assertFalse(file_id.startswith("Error:"), file_id) # fetch - markdown = self.ap.async_fetch(file_id=file_id) + markdown_list = self.ap.async_fetch(file_id=file_id) + markdown = "\n".join(markdown_list) self.assertFalse(markdown.startswith("Error:"), markdown) correct_output = get_ground_truth(correct_output_file) percentage = compare_markdown(markdown, correct_output) @@ -166,7 +172,8 @@ def test_pptx_sync_extract(self): correct_output_file = "./tests/outputs/correct_pptx_output.txt" # extract - markdown, elapsed_time = self.ap.parse(file_path=working_file) + markdown_list, elapsed_time = self.ap.parse(file_path=working_file) + markdown = "\n".join(markdown_list) self.assertFalse(markdown.startswith("Error:"), markdown) correct_output = get_ground_truth(correct_output_file) percentage = compare_markdown(markdown, correct_output) @@ -185,7 +192,8 @@ def test_pptx_async_parse_and_fetch(self): file_id = self.ap.async_parse(file_path=working_file) self.assertFalse(file_id.startswith("Error:"), file_id) # fetch - markdown = self.ap.async_fetch(file_id=file_id) + markdown_list = self.ap.async_fetch(file_id=file_id) + markdown = "\n".join(markdown_list) self.assertFalse(markdown.startswith("Error:"), markdown) correct_output = get_ground_truth(correct_output_file) percentage = compare_markdown(markdown, correct_output) @@ -200,7 +208,8 @@ def test_image_sync_extract(self): correct_output_file = "./tests/outputs/correct_png_output.txt" # extract - markdown, elapsed_time = self.ap.parse(file_path=working_file) + markdown_list, elapsed_time = self.ap.parse(file_path=working_file) + markdown = "\n".join(markdown_list) self.assertFalse(markdown.startswith("Error:"), markdown) correct_output = get_ground_truth(correct_output_file) percentage = compare_markdown(markdown, correct_output) @@ -219,7 +228,8 @@ def test_image_async_parse_and_fetch(self): file_id = self.ap.async_parse(file_path=working_file) self.assertFalse(file_id.startswith("Error:"), file_id) # fetch - markdown = self.ap.async_fetch(file_id=file_id) + markdown_list = self.ap.async_fetch(file_id=file_id) + markdown = "\n".join(markdown_list) self.assertFalse(markdown.startswith("Error:"), markdown) correct_output = get_ground_truth(correct_output_file) percentage = compare_markdown(markdown, correct_output)