Skip to content

Commit

Permalink
add image and pptx tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Ubuntu committed Sep 28, 2024
1 parent 1ee5535 commit 070ba9c
Show file tree
Hide file tree
Showing 6 changed files with 138 additions and 16 deletions.
4 changes: 2 additions & 2 deletions any_parser/any_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def extract(
if file_extension not in SUPPORTED_FILE_EXTENSIONS:
supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS)
return (
f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}",
f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}.",
None,
)

Expand Down Expand Up @@ -128,7 +128,7 @@ def async_extract(self, file_path: str, extract_args: Optional[Dict] = None) ->
# Check for valid file extension
if file_extension not in SUPPORTED_FILE_EXTENSIONS:
supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS)
return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}"
return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}."

file_name = Path(file_path).name
# Create the JSON payload
Expand Down
46 changes: 37 additions & 9 deletions examples/image_to_markdown.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -55,7 +55,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -84,7 +84,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -105,7 +105,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 5,
"metadata": {},
"outputs": [
{
Expand All @@ -115,7 +115,7 @@
"<IPython.core.display.Image object>"
]
},
"execution_count": 25,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -136,15 +136,43 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/markdown": [
"| Investor Metrics | FY23 Q1 | FY23 Q2 | FY23 Q3 | FY23 Q4 | FY24 Q1 |\n",
"|---|---|---|---|---|---|\n",
"| Office Commercial products and cloud services revenue growth (y/y) | 7% / 13% | 7% / 14% | 13% / 17% | 12% / 14% | 15% / 14% |\n",
"| Office Consumer products and cloud services revenue growth (y/y) | 7% / 11% | (2)% / 3% | 1% / 4% | 3% / 6% | 3% / 4% |\n",
"| Office 365 Commercial seat growth (y/y) | 14% | 12% | 11% | 11% | 10% |\n",
"| Microsoft 365 Consumer subscribers (in millions) | 65.1 | 67.7 | 70.8 | 74.9 | 76.7 |\n",
"| Dynamics products and cloud services revenue growth (y/y) | 15% / 22% | 13% / 20% | 17% / 21% | 19% / 21% | 22% / 21% |\n",
"| LinkedIn revenue growth (y/y) | 17% / 21% | 10% / 14% | 8% / 11% | 6% / 8% | 8% |\n",
"\n",
"Growth rates include non-GAAP CC growth (GAAP % / CC %)."
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time Elapsed: 2.61 seconds\n"
]
}
],
"source": [
"ap = AnyParser(example_apikey)\n",
"\n",
"# extract returns a tuple containing the markdown as a string and total time\n",
"markdown_string, total_time = ap.extract(example_local_file)\n",
"\n",
"display(Markdown(markdown_string))\n",
"print(total_time)\n"
]
Expand Down Expand Up @@ -179,7 +207,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
"version": "3.10.15"
}
},
"nbformat": 4,
Expand Down
2 changes: 1 addition & 1 deletion examples/pdf_to_markdown.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
"version": "3.10.15"
}
},
"nbformat": 4,
Expand Down
10 changes: 10 additions & 0 deletions tests/outputs/correct_png_output.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
| Investor Metrics | FY23 Q1 | FY23 Q2 | FY23 Q3 | FY23 Q4 | FY24 Q1 |
|---|---|---|---|---|---|
| Office Commercial products and cloud services revenue growth (y/y) | 7% / 13% | 7% / 14% | 13% / 17% | 12% / 14% | 15% / 14% |
| Office Consumer products and cloud services revenue growth (y/y) | 7% / 11% | (2)% / 3% | 1% / 4% | 3% / 6% | 3% / 4% |
| Office 365 Commercial seat growth (y/y) | 14% | 12% | 11% | 11% | 10% |
| Microsoft 365 Consumer subscribers (in millions) | 65.1 | 67.7 | 70.8 | 74.9 | 76.7 |
| Dynamics products and cloud services revenue growth (y/y) | 15% / 22% | 13% / 20% | 17% / 21% | 19% / 21% | 22% / 21% |
| LinkedIn revenue growth (y/y) | 17% / 21% | 10% / 14% | 8% / 11% | 6% / 8% | 8% |

Growth rates include non-GAAP CC growth (GAAP % / CC %).
16 changes: 16 additions & 0 deletions tests/outputs/correct_pptx_output.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
## Test finical report
## Title

• Chart 1 example

| Investor Metrics | FY23 Q1 | FY23 Q2 | FY23 Q3 | FY23 Q4 | FY24 Q1 |
|-----------------|---------|---------|---------|---------|---------|
| Office Commercial products and cloud services revenue growth (y/y) | 7% / 13% | 7% / 14% | 13% / 17% | 12% / 14% | 15% / 14% |
| Office Consumer products and cloud services revenue growth (y/y) | 7% / 11% | (2)% / 3% | 1% / 4% | 3% / 6% | 3% / 4% |
| Office 365 Commercial seat growth (y/y) | 14% | 12% | 11% | 11% | 10% |
| Microsoft 365 Consumer subscribers (in millions) | 65.1 | 67.7 | 70.8 | 74.9 | 76.7 |
| Dynamics products and cloud services revenue growth (y/y) | 15% / 22% | 13% / 20% | 17% / 21% | 19% / 21% | 22% / 21% |
| LinkedIn revenue growth (y/y) | 17% / 21% | 10% / 14% | 8% / 11% | 6% / 8% | 8% |

Growth rates include non-GAAP CC growth (GAAP % / CC %).
## Thanks
76 changes: 72 additions & 4 deletions tests/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def setUp(self):
self.ap = AnyParser(self.api_key)

def test_pdf_sync_extract(self):
"""Synchronous Extraction"""
"""Synchronous PDF Extraction"""
working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf"
correct_output_file = "./tests/outputs/correct_pdf_output.txt"

Expand All @@ -55,7 +55,7 @@ def test_pdf_sync_extract(self):
self.assertIn("Time Elapsed", elapsed_time)

def test_pdf_async_extract_and_fetch(self):
"""Asynchronous Extraction and Fetch"""
"""Asynchronous PDF Extraction and Fetch"""
working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf"
correct_output_file = "./tests/outputs/correct_pdf_output.txt"

Expand All @@ -73,7 +73,7 @@ def test_pdf_async_extract_and_fetch(self):
)

def test_docx_sync_extract(self):
"""Synchronous Extraction"""
"""Synchronous Word Extraction"""
working_file = "./examples/sample_data/test_odf.docx"
correct_output_file = "./tests/outputs/correct_docx_output.txt"

Expand All @@ -89,7 +89,7 @@ def test_docx_sync_extract(self):
self.assertIn("Time Elapsed", elapsed_time)

def test_docx_async_extract_and_fetch(self):
"""Asynchronous Extraction and Fetch"""
"""Asynchronous Word Extraction and Fetch"""
working_file = "./examples/sample_data/test_odf.docx"
correct_output_file = "./tests/outputs/correct_docx_output.txt"

Expand All @@ -106,6 +106,74 @@ def test_docx_async_extract_and_fetch(self):
percentage, 90, f"Output similarity too low: {percentage:.2f}%"
)

def test_pptx_sync_extract(self):
    """Synchronous Powerpoint Extraction

    Sends the sample .pptx through the blocking ``extract`` call, then
    requires a ``compare_markdown`` score of at least 90 against the
    stored reference output and a "Time Elapsed" marker in the second
    element of the returned tuple.
    """
    sample_path = "./examples/sample_data/test_odf.pptx"
    reference_path = "./tests/outputs/correct_pptx_output.txt"

    # extract() hands back a (markdown, timing) pair.
    result = self.ap.extract(sample_path)
    parsed_md, timing = result

    # An "Error:" prefix signals a failed call; surface the text itself
    # as the assertion message so the failure explains what went wrong.
    is_error = parsed_md.startswith("Error:")
    self.assertFalse(is_error, msg=parsed_md)

    # The reference is loaded only after the extraction has succeeded.
    score = compare_markdown(parsed_md, get_ground_truth(reference_path))
    self.assertGreaterEqual(
        score, 90, msg=f"Output similarity too low: {score:.2f}%"
    )
    self.assertIn("Time Elapsed", timing)

def test_pptx_async_extract_and_fetch(self):
    """Asynchronous Powerpoint Extraction and Fetch

    Submits the sample .pptx with ``async_extract``, retrieves the result
    with ``async_fetch`` using the returned id, and requires a
    ``compare_markdown`` score of at least 90 against the stored
    reference output.
    """
    sample_path = "./examples/sample_data/test_odf.pptx"
    reference_path = "./tests/outputs/correct_pptx_output.txt"

    # Step 1: submit. An "Error:" prefix means the submission was rejected.
    job_id = self.ap.async_extract(sample_path)
    submit_failed = job_id.startswith("Error:")
    self.assertFalse(submit_failed, msg=job_id)

    # Step 2: fetch the markdown for the id obtained above.
    parsed_md = self.ap.async_fetch(file_id=job_id)
    fetch_failed = parsed_md.startswith("Error:")
    self.assertFalse(fetch_failed, msg=parsed_md)

    # Step 3: compare against the reference, loaded only after both
    # remote calls have succeeded.
    score = compare_markdown(parsed_md, get_ground_truth(reference_path))
    self.assertGreaterEqual(
        score, 90, msg=f"Output similarity too low: {score:.2f}%"
    )

def test_image_sync_extract(self):
    """Synchronous Image Extraction

    Sends the sample .png through the blocking ``extract`` call, then
    requires a ``compare_markdown`` score of at least 90 against the
    stored reference output and a "Time Elapsed" marker in the second
    element of the returned tuple.
    """
    sample_path = "./examples/sample_data/test3.png"
    reference_path = "./tests/outputs/correct_png_output.txt"

    # extract() hands back a (markdown, timing) pair.
    result = self.ap.extract(sample_path)
    parsed_md, timing = result

    # An "Error:" prefix signals a failed call; surface the text itself
    # as the assertion message so the failure explains what went wrong.
    is_error = parsed_md.startswith("Error:")
    self.assertFalse(is_error, msg=parsed_md)

    # The reference is loaded only after the extraction has succeeded.
    score = compare_markdown(parsed_md, get_ground_truth(reference_path))
    self.assertGreaterEqual(
        score, 90, msg=f"Output similarity too low: {score:.2f}%"
    )
    self.assertIn("Time Elapsed", timing)

def test_image_async_extract_and_fetch(self):
    """Asynchronous Image Extraction and Fetch

    Submits the sample .png with ``async_extract``, retrieves the result
    with ``async_fetch`` using the returned id, and requires a
    ``compare_markdown`` score of at least 90 against the stored
    reference output.
    """
    sample_path = "./examples/sample_data/test3.png"
    reference_path = "./tests/outputs/correct_png_output.txt"

    # Step 1: submit. An "Error:" prefix means the submission was rejected.
    job_id = self.ap.async_extract(sample_path)
    submit_failed = job_id.startswith("Error:")
    self.assertFalse(submit_failed, msg=job_id)

    # Step 2: fetch the markdown for the id obtained above.
    parsed_md = self.ap.async_fetch(file_id=job_id)
    fetch_failed = parsed_md.startswith("Error:")
    self.assertFalse(fetch_failed, msg=parsed_md)

    # Step 3: compare against the reference, loaded only after both
    # remote calls have succeeded.
    score = compare_markdown(parsed_md, get_ground_truth(reference_path))
    self.assertGreaterEqual(
        score, 90, msg=f"Output similarity too low: {score:.2f}%"
    )


# Script entry point: when this file is executed directly (not imported),
# hand control to unittest's command-line runner for this module.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 070ba9c

Please sign in to comment.