From 070ba9c7c8b443dd7f0586145113f2b947bfc9bc Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-50-96.us-west-2.compute.internal>
Date: Sat, 28 Sep 2024 02:17:49 +0000
Subject: [PATCH] add image and pptx tests

---
 any_parser/any_parser.py              |  4 +-
 examples/image_to_markdown.ipynb      | 46 ++++++++++++----
 examples/pdf_to_markdown.ipynb        |  2 +-
 tests/outputs/correct_png_output.txt  | 10 ++++
 tests/outputs/correct_pptx_output.txt | 16 ++++++
 tests/test.py                         | 76 +++++++++++++++++++++++++--
 6 files changed, 138 insertions(+), 16 deletions(-)
 create mode 100644 tests/outputs/correct_png_output.txt
 create mode 100644 tests/outputs/correct_pptx_output.txt

diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py
index 1dfcffd..5ae5adb 100644
--- a/any_parser/any_parser.py
+++ b/any_parser/any_parser.py
@@ -66,7 +66,7 @@ def extract(
         if file_extension not in SUPPORTED_FILE_EXTENSIONS:
             supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS)
             return (
-                f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}",
+                f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}.",
                 None,
             )
 
@@ -128,7 +128,7 @@ def async_extract(self, file_path: str, extract_args: Optional[Dict] = None) ->
         # Check for valid file extension
         if file_extension not in SUPPORTED_FILE_EXTENSIONS:
             supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS)
-            return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}"
+            return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}."
 
         file_name = Path(file_path).name
         # Create the JSON payload
diff --git a/examples/image_to_markdown.ipynb b/examples/image_to_markdown.ipynb
index 929885d..19121ab 100644
--- a/examples/image_to_markdown.ipynb
+++ b/examples/image_to_markdown.ipynb
@@ -32,7 +32,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -55,7 +55,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -84,7 +84,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -105,7 +105,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -115,7 +115,7 @@
        "<IPython.core.display.Image object>"
       ]
      },
-     "execution_count": 25,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -136,15 +136,43 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/markdown": [
+       "| Investor Metrics | FY23 Q1 | FY23 Q2 | FY23 Q3 | FY23 Q4 | FY24 Q1 |\n",
+       "|---|---|---|---|---|---|\n",
+       "| Office Commercial products and cloud services revenue growth (y/y) | 7% / 13% | 7% / 14% | 13% / 17% | 12% / 14% | 15% / 14% |\n",
+       "| Office Consumer products and cloud services revenue growth (y/y) | 7% / 11% | (2)% / 3% | 1% / 4% | 3% / 6% | 3% / 4% |\n",
+       "| Office 365 Commercial seat growth (y/y) | 14% | 12% | 11% | 11% | 10% |\n",
+       "| Microsoft 365 Consumer subscribers (in millions) | 65.1 | 67.7 | 70.8 | 74.9 | 76.7 |\n",
+       "| Dynamics products and cloud services revenue growth (y/y) | 15% / 22% | 13% / 20% | 17% / 21% | 19% / 21% | 22% / 21% |\n",
+       "| LinkedIn revenue growth (y/y) | 17% / 21% | 10% / 14% | 8% / 11% | 6% / 8% | 8% |\n",
+       "\n",
+       "Growth rates include non-GAAP CC growth (GAAP % / CC %)."
+      ],
+      "text/plain": [
+       "<IPython.core.display.Markdown object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Time Elapsed: 2.61 seconds\n"
+     ]
+    }
+   ],
    "source": [
     "ap = AnyParser(example_apikey)\n",
     "\n",
     "# extract returns a tuple containing the markdown as a string and total time\n",
     "markdown_string, total_time = ap.extract(example_local_file)\n",
-    "\n",
     "display(Markdown(markdown_string))\n",
     "print(total_time)\n"
    ]
@@ -179,7 +207,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.14"
+   "version": "3.10.15"
   }
  },
  "nbformat": 4,
diff --git a/examples/pdf_to_markdown.ipynb b/examples/pdf_to_markdown.ipynb
index 940db0b..7cf4748 100644
--- a/examples/pdf_to_markdown.ipynb
+++ b/examples/pdf_to_markdown.ipynb
@@ -356,7 +356,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.14"
+   "version": "3.10.15"
   }
  },
  "nbformat": 4,
diff --git a/tests/outputs/correct_png_output.txt b/tests/outputs/correct_png_output.txt
new file mode 100644
index 0000000..43be0fb
--- /dev/null
+++ b/tests/outputs/correct_png_output.txt
@@ -0,0 +1,10 @@
+| Investor Metrics | FY23 Q1 | FY23 Q2 | FY23 Q3 | FY23 Q4 | FY24 Q1 |
+|---|---|---|---|---|---|
+| Office Commercial products and cloud services revenue growth (y/y) | 7% / 13% | 7% / 14% | 13% / 17% | 12% / 14% | 15% / 14% |
+| Office Consumer products and cloud services revenue growth (y/y) | 7% / 11% | (2)% / 3% | 1% / 4% | 3% / 6% | 3% / 4% |
+| Office 365 Commercial seat growth (y/y) | 14% | 12% | 11% | 11% | 10% |
+| Microsoft 365 Consumer subscribers (in millions) | 65.1 | 67.7 | 70.8 | 74.9 | 76.7 |
+| Dynamics products and cloud services revenue growth (y/y) | 15% / 22% | 13% / 20% | 17% / 21% | 19% / 21% | 22% / 21% |
+| LinkedIn revenue growth (y/y) | 17% / 21% | 10% / 14% | 8% / 11% | 6% / 8% | 8% |
+
+Growth rates include non-GAAP CC growth (GAAP % / CC %).
\ No newline at end of file
diff --git a/tests/outputs/correct_pptx_output.txt b/tests/outputs/correct_pptx_output.txt
new file mode 100644
index 0000000..8dd5c22
--- /dev/null
+++ b/tests/outputs/correct_pptx_output.txt
@@ -0,0 +1,16 @@
+## Test finical report
+## Title
+
+• Chart 1 example
+
+| Investor Metrics | FY23 Q1 | FY23 Q2 | FY23 Q3 | FY23 Q4 | FY24 Q1 |
+|-----------------|---------|---------|---------|---------|---------|
+| Office Commercial products and cloud services revenue growth (y/y) | 7% / 13% | 7% / 14% | 13% / 17% | 12% / 14% | 15% / 14% |
+| Office Consumer products and cloud services revenue growth (y/y) | 7% / 11% | (2)% / 3% | 1% / 4% | 3% / 6% | 3% / 4% |
+| Office 365 Commercial seat growth (y/y) | 14% | 12% | 11% | 11% | 10% |
+| Microsoft 365 Consumer subscribers (in millions) | 65.1 | 67.7 | 70.8 | 74.9 | 76.7 |
+| Dynamics products and cloud services revenue growth (y/y) | 15% / 22% | 13% / 20% | 17% / 21% | 19% / 21% | 22% / 21% |
+| LinkedIn revenue growth (y/y) | 17% / 21% | 10% / 14% | 8% / 11% | 6% / 8% | 8% |
+
+Growth rates include non-GAAP CC growth (GAAP % / CC %).
+## Thanks
\ No newline at end of file
diff --git a/tests/test.py b/tests/test.py
index d22a5a5..35fe3ff 100755
--- a/tests/test.py
+++ b/tests/test.py
@@ -39,7 +39,7 @@ def setUp(self):
         self.ap = AnyParser(self.api_key)
 
     def test_pdf_sync_extract(self):
-        """Synchronous Extraction"""
+        """Synchronous PDF Extraction"""
         working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf"
         correct_output_file = "./tests/outputs/correct_pdf_output.txt"
 
@@ -55,7 +55,7 @@ def test_pdf_sync_extract(self):
         self.assertIn("Time Elapsed", elapsed_time)
 
     def test_pdf_async_extract_and_fetch(self):
-        """Asynchronous Extraction and Fetch"""
+        """Asynchronous PDF Extraction and Fetch"""
         working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf"
         correct_output_file = "./tests/outputs/correct_pdf_output.txt"
 
@@ -73,7 +73,7 @@ def test_pdf_async_extract_and_fetch(self):
         )
 
     def test_docx_sync_extract(self):
-        """Synchronous Extraction"""
+        """Synchronous Word Extraction"""
         working_file = "./examples/sample_data/test_odf.docx"
         correct_output_file = "./tests/outputs/correct_docx_output.txt"
 
@@ -89,7 +89,7 @@ def test_docx_sync_extract(self):
         self.assertIn("Time Elapsed", elapsed_time)
 
     def test_docx_async_extract_and_fetch(self):
-        """Asynchronous Extraction and Fetch"""
+        """Asynchronous Word Extraction and Fetch"""
         working_file = "./examples/sample_data/test_odf.docx"
         correct_output_file = "./tests/outputs/correct_docx_output.txt"
 
@@ -106,6 +106,74 @@ def test_docx_async_extract_and_fetch(self):
             percentage, 90, f"Output similarity too low: {percentage:.2f}%"
         )
 
+    def test_pptx_sync_extract(self):
+        """Synchronous PowerPoint Extraction"""
+        working_file = "./examples/sample_data/test_odf.pptx"
+        correct_output_file = "./tests/outputs/correct_pptx_output.txt"
+
+        # extract
+        markdown, elapsed_time = self.ap.extract(working_file)
+        self.assertFalse(markdown.startswith("Error:"), markdown)
+        correct_output = get_ground_truth(correct_output_file)
+        percentage = compare_markdown(markdown, correct_output)
+
+        self.assertGreaterEqual(
+            percentage, 90, f"Output similarity too low: {percentage:.2f}%"
+        )
+        self.assertIn("Time Elapsed", elapsed_time)
+
+    def test_pptx_async_extract_and_fetch(self):
+        """Asynchronous PowerPoint Extraction and Fetch"""
+        working_file = "./examples/sample_data/test_odf.pptx"
+        correct_output_file = "./tests/outputs/correct_pptx_output.txt"
+
+        # extract
+        file_id = self.ap.async_extract(working_file)
+        self.assertFalse(file_id.startswith("Error:"), file_id)
+        # fetch
+        markdown = self.ap.async_fetch(file_id=file_id)
+        self.assertFalse(markdown.startswith("Error:"), markdown)
+        correct_output = get_ground_truth(correct_output_file)
+        percentage = compare_markdown(markdown, correct_output)
+
+        self.assertGreaterEqual(
+            percentage, 90, f"Output similarity too low: {percentage:.2f}%"
+        )
+
+    def test_image_sync_extract(self):
+        """Synchronous Image Extraction"""
+        working_file = "./examples/sample_data/test3.png"
+        correct_output_file = "./tests/outputs/correct_png_output.txt"
+
+        # extract
+        markdown, elapsed_time = self.ap.extract(working_file)
+        self.assertFalse(markdown.startswith("Error:"), markdown)
+        correct_output = get_ground_truth(correct_output_file)
+        percentage = compare_markdown(markdown, correct_output)
+
+        self.assertGreaterEqual(
+            percentage, 90, f"Output similarity too low: {percentage:.2f}%"
+        )
+        self.assertIn("Time Elapsed", elapsed_time)
+
+    def test_image_async_extract_and_fetch(self):
+        """Asynchronous Image Extraction and Fetch"""
+        working_file = "./examples/sample_data/test3.png"
+        correct_output_file = "./tests/outputs/correct_png_output.txt"
+
+        # extract
+        file_id = self.ap.async_extract(working_file)
+        self.assertFalse(file_id.startswith("Error:"), file_id)
+        # fetch
+        markdown = self.ap.async_fetch(file_id=file_id)
+        self.assertFalse(markdown.startswith("Error:"), markdown)
+        correct_output = get_ground_truth(correct_output_file)
+        percentage = compare_markdown(markdown, correct_output)
+
+        self.assertGreaterEqual(
+            percentage, 90, f"Output similarity too low: {percentage:.2f}%"
+        )
+
 
 if __name__ == "__main__":
     unittest.main()