From 3e5a020ca20c980d83f460e46743cd0470732ac3 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Fri, 15 Nov 2024 11:30:00 +0000 Subject: [PATCH] fix tests --- tests/test_document_builder.py | 4 ++++ tests/test_ocr_pipeline.py | 10 ++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/test_document_builder.py b/tests/test_document_builder.py index 085a1bdb..1cf41145 100644 --- a/tests/test_document_builder.py +++ b/tests/test_document_builder.py @@ -8,8 +8,10 @@ def test_document_builder(pdf_document): first_block = first_page.get_block(first_page.structure[0]) assert first_block.block_type == 'Section-header' assert first_block.text_extraction_method == 'pdftext' + first_text_block: Line = first_page.get_block(first_block.structure[0]) assert first_text_block.block_type == 'Line' + first_span = first_page.get_block(first_text_block.structure[0]) assert first_span.block_type == 'Span' assert first_span.text == 'Subspace Adversarial Training' @@ -18,8 +20,10 @@ def test_document_builder(pdf_document): last_block = first_page.get_block(first_page.structure[-1]) assert last_block.block_type == 'Text-inline-math' + last_text_block: Line = first_page.get_block(last_block.structure[-1]) assert last_text_block.block_type == 'Line' + last_span = first_page.get_block(last_text_block.structure[-1]) assert last_span.block_type == 'Span' assert last_span.text == 'prove the quality of single-step AT solutions. However,' diff --git a/tests/test_ocr_pipeline.py b/tests/test_ocr_pipeline.py index 5b2d756b..3cc8b5ed 100644 --- a/tests/test_ocr_pipeline.py +++ b/tests/test_ocr_pipeline.py @@ -2,11 +2,11 @@ from tests.utils import setup_pdf_document -def test_document_builder(): +def test_ocr_pipeline(): pdf_document = setup_pdf_document( "adversarial.pdf", document_builder_config={ - "force_ocr": False + "force_ocr": True } ) @@ -16,12 +16,14 @@ def test_document_builder(): first_block = first_page.get_block(first_page.structure[0]) assert first_block.text_extraction_method == 'surya' assert first_block.block_type == 'Section-header' + first_text_block: Line = first_page.get_block(first_block.structure[0]) assert first_text_block.block_type == 'Line' + first_span = first_page.get_block(first_text_block.structure[0]) assert first_span.block_type == 'Span' - assert first_span.text == 'Subspace Adversarial Training' + assert first_span.text.strip() == 'Subspace Adversarial Training' if __name__ == "__main__": - test_document_builder() + test_ocr_pipeline()