From 070ba9c7c8b443dd7f0586145113f2b947bfc9bc Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 28 Sep 2024 02:17:49 +0000 Subject: [PATCH] add image and pptx tests --- any_parser/any_parser.py | 4 +- examples/image_to_markdown.ipynb | 46 ++++++++++++---- examples/pdf_to_markdown.ipynb | 2 +- tests/outputs/correct_png_output.txt | 10 ++++ tests/outputs/correct_pptx_output.txt | 16 ++++++ tests/test.py | 76 +++++++++++++++++++++++++-- 6 files changed, 138 insertions(+), 16 deletions(-) create mode 100644 tests/outputs/correct_png_output.txt create mode 100644 tests/outputs/correct_pptx_output.txt diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py index 1dfcffd..5ae5adb 100644 --- a/any_parser/any_parser.py +++ b/any_parser/any_parser.py @@ -66,7 +66,7 @@ def extract( if file_extension not in SUPPORTED_FILE_EXTENSIONS: supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS) return ( - f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}", + f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}.", None, ) @@ -128,7 +128,7 @@ def async_extract(self, file_path: str, extract_args: Optional[Dict] = None) -> # Check for valid file extension if file_extension not in SUPPORTED_FILE_EXTENSIONS: supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS) - return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}" + return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}." file_name = Path(file_path).name # Create the JSON payload diff --git a/examples/image_to_markdown.ipynb b/examples/image_to_markdown.ipynb index 929885d..19121ab 100644 --- a/examples/image_to_markdown.ipynb +++ b/examples/image_to_markdown.ipynb @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -84,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -105,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -115,7 +115,7 @@ "" ] }, - "execution_count": 25, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -136,15 +136,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/markdown": [ + "| Investor Metrics | FY23 Q1 | FY23 Q2 | FY23 Q3 | FY23 Q4 | FY24 Q1 |\n", + "|---|---|---|---|---|---|\n", + "| Office Commercial products and cloud services revenue growth (y/y) | 7% / 13% | 7% / 14% | 13% / 17% | 12% / 14% | 15% / 14% |\n", + "| Office Consumer products and cloud services revenue growth (y/y) | 7% / 11% | (2)% / 3% | 1% / 4% | 3% / 6% | 3% / 4% |\n", + "| Office 365 Commercial seat growth (y/y) | 14% | 12% | 11% | 11% | 10% |\n", + "| Microsoft 365 Consumer subscribers (in millions) | 65.1 | 67.7 | 70.8 | 74.9 | 76.7 |\n", + "| Dynamics products and cloud services revenue growth (y/y) | 15% / 22% | 13% / 20% | 17% / 21% | 19% / 21% | 22% / 21% |\n", + "| LinkedIn revenue growth (y/y) | 17% / 21% | 10% / 14% | 8% / 11% | 6% / 8% | 8% |\n", + "\n", + "Growth rates include non-GAAP CC growth (GAAP % / CC %)." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time Elapsed: 2.61 seconds\n" + ] + } + ], "source": [ "ap = AnyParser(example_apikey)\n", "\n", "# extract returns a tuple containing the markdown as a string and total time\n", "markdown_string, total_time = ap.extract(example_local_file)\n", - "\n", "display(Markdown(markdown_string))\n", "print(total_time)\n" ] @@ -179,7 +207,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.10.15" } }, "nbformat": 4, diff --git a/examples/pdf_to_markdown.ipynb b/examples/pdf_to_markdown.ipynb index 940db0b..7cf4748 100644 --- a/examples/pdf_to_markdown.ipynb +++ b/examples/pdf_to_markdown.ipynb @@ -356,7 +356,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.10.15" } }, "nbformat": 4, diff --git a/tests/outputs/correct_png_output.txt b/tests/outputs/correct_png_output.txt new file mode 100644 index 0000000..43be0fb --- /dev/null +++ b/tests/outputs/correct_png_output.txt @@ -0,0 +1,10 @@ +| Investor Metrics | FY23 Q1 | FY23 Q2 | FY23 Q3 | FY23 Q4 | FY24 Q1 | +|---|---|---|---|---|---| +| Office Commercial products and cloud services revenue growth (y/y) | 7% / 13% | 7% / 14% | 13% / 17% | 12% / 14% | 15% / 14% | +| Office Consumer products and cloud services revenue growth (y/y) | 7% / 11% | (2)% / 3% | 1% / 4% | 3% / 6% | 3% / 4% | +| Office 365 Commercial seat growth (y/y) | 14% | 12% | 11% | 11% | 10% | +| Microsoft 365 Consumer subscribers (in millions) | 65.1 | 67.7 | 70.8 | 74.9 | 76.7 | +| Dynamics products and cloud services revenue growth (y/y) | 15% / 22% | 13% / 20% | 17% / 21% | 19% / 21% | 22% / 21% | +| LinkedIn revenue growth (y/y) | 17% / 21% | 10% / 14% | 8% / 11% | 6% / 8% | 8% | + +Growth rates include non-GAAP CC growth (GAAP % / CC %). \ No newline at end of file diff --git a/tests/outputs/correct_pptx_output.txt b/tests/outputs/correct_pptx_output.txt new file mode 100644 index 0000000..8dd5c22 --- /dev/null +++ b/tests/outputs/correct_pptx_output.txt @@ -0,0 +1,16 @@ +## Test finical report +## Title + +• Chart 1 example + +| Investor Metrics | FY23 Q1 | FY23 Q2 | FY23 Q3 | FY23 Q4 | FY24 Q1 | +|-----------------|---------|---------|---------|---------|---------| +| Office Commercial products and cloud services revenue growth (y/y) | 7% / 13% | 7% / 14% | 13% / 17% | 12% / 14% | 15% / 14% | +| Office Consumer products and cloud services revenue growth (y/y) | 7% / 11% | (2)% / 3% | 1% / 4% | 3% / 6% | 3% / 4% | +| Office 365 Commercial seat growth (y/y) | 14% | 12% | 11% | 11% | 10% | +| Microsoft 365 Consumer subscribers (in millions) | 65.1 | 67.7 | 70.8 | 74.9 | 76.7 | +| Dynamics products and cloud services revenue growth (y/y) | 15% / 22% | 13% / 20% | 17% / 21% | 19% / 21% | 22% / 21% | +| LinkedIn revenue growth (y/y) | 17% / 21% | 10% / 14% | 8% / 11% | 6% / 8% | 8% | + +Growth rates include non-GAAP CC growth (GAAP % / CC %). +## Thanks \ No newline at end of file diff --git a/tests/test.py b/tests/test.py index d22a5a5..35fe3ff 100755 --- a/tests/test.py +++ b/tests/test.py @@ -39,7 +39,7 @@ def setUp(self): self.ap = AnyParser(self.api_key) def test_pdf_sync_extract(self): - """Synchronous Extraction""" + """Synchronous PDF Extraction""" working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf" correct_output_file = "./tests/outputs/correct_pdf_output.txt" @@ -55,7 +55,7 @@ def test_pdf_sync_extract(self): self.assertIn("Time Elapsed", elapsed_time) def test_pdf_async_extract_and_fetch(self): - """Asynchronous Extraction and Fetch""" + """Asynchronous PDF Extraction and Fetch""" working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf" correct_output_file = "./tests/outputs/correct_pdf_output.txt" @@ -73,7 +73,7 @@ def test_pdf_async_extract_and_fetch(self): ) def test_docx_sync_extract(self): - """Synchronous Extraction""" + """Synchronous Word Extraction""" working_file = "./examples/sample_data/test_odf.docx" correct_output_file = "./tests/outputs/correct_docx_output.txt" @@ -89,7 +89,7 @@ def test_docx_sync_extract(self): self.assertIn("Time Elapsed", elapsed_time) def test_docx_async_extract_and_fetch(self): - """Asynchronous Extraction and Fetch""" + """Asynchronous Word Extraction and Fetch""" working_file = "./examples/sample_data/test_odf.docx" correct_output_file = "./tests/outputs/correct_docx_output.txt" @@ -106,6 +106,74 @@ def test_docx_async_extract_and_fetch(self): percentage, 90, f"Output similarity too low: {percentage:.2f}%" ) + def test_pptx_sync_extract(self): + """Synchronous Powerpoint Extraction""" + working_file = "./examples/sample_data/test_odf.pptx" + correct_output_file = "./tests/outputs/correct_pptx_output.txt" + + # extract + markdown, elapsed_time = self.ap.extract(working_file) + self.assertFalse(markdown.startswith("Error:"), markdown) + correct_output = get_ground_truth(correct_output_file) + percentage = compare_markdown(markdown, correct_output) + + self.assertGreaterEqual( + percentage, 90, f"Output similarity too low: {percentage:.2f}%" + ) + self.assertIn("Time Elapsed", elapsed_time) + + def test_pptx_async_extract_and_fetch(self): + """Asynchronous Powerpoint Extraction and Fetch""" + working_file = "./examples/sample_data/test_odf.pptx" + correct_output_file = "./tests/outputs/correct_pptx_output.txt" + + # extract + file_id = self.ap.async_extract(working_file) + self.assertFalse(file_id.startswith("Error:"), file_id) + # fetch + markdown = self.ap.async_fetch(file_id=file_id) + self.assertFalse(markdown.startswith("Error:"), markdown) + correct_output = get_ground_truth(correct_output_file) + percentage = compare_markdown(markdown, correct_output) + + self.assertGreaterEqual( + percentage, 90, f"Output similarity too low: {percentage:.2f}%" + ) + + def test_image_sync_extract(self): + """Synchronous Image Extraction""" + working_file = "./examples/sample_data/test3.png" + correct_output_file = "./tests/outputs/correct_png_output.txt" + + # extract + markdown, elapsed_time = self.ap.extract(working_file) + self.assertFalse(markdown.startswith("Error:"), markdown) + correct_output = get_ground_truth(correct_output_file) + percentage = compare_markdown(markdown, correct_output) + + self.assertGreaterEqual( + percentage, 90, f"Output similarity too low: {percentage:.2f}%" + ) + self.assertIn("Time Elapsed", elapsed_time) + + def test_image_async_extract_and_fetch(self): + """Asynchronous Image Extraction and Fetch""" + working_file = "./examples/sample_data/test3.png" + correct_output_file = "./tests/outputs/correct_png_output.txt" + + # extract + file_id = self.ap.async_extract(working_file) + self.assertFalse(file_id.startswith("Error:"), file_id) + # fetch + markdown = self.ap.async_fetch(file_id=file_id) + self.assertFalse(markdown.startswith("Error:"), markdown) + correct_output = get_ground_truth(correct_output_file) + percentage = compare_markdown(markdown, correct_output) + + self.assertGreaterEqual( + percentage, 90, f"Output similarity too low: {percentage:.2f}%" + ) + if __name__ == "__main__": unittest.main()