@pytest.mark.parametrize("filename", ["_sample_docs/super_long_pages.pdf"])
def test_long_pages_hi_res(filename):
    """Hi-res partitioning of a PDF with extremely long pages succeeds.

    Regression test for the split-PDF hook: pages taller than the supported
    maximum must be sliced into multiple pages before partitioning rather
    than failing the request.
    """
    # Context manager so the file handle is closed even if the request
    # raises (the previous version leaked the open handle).
    with open(filename, "rb") as pdf_file:
        req = operations.PartitionRequest(
            partition_parameters=shared.PartitionParameters(
                files=shared.Files(
                    content=pdf_file,
                    file_name=filename,
                ),
                strategy=shared.Strategy.HI_RES,
                split_pdf_page=True,
                split_pdf_allow_failed=True,
                split_pdf_concurrency_level=15,
            ),
        )

        client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")
        response = client.general.partition(request=req)

    assert response.status_code == 200
    assert len(response.elements)
def _clean_large_pages(self, pdf: PdfReader) -> PdfReader:
    """Slice any over-long page into multiple cropped pages.

    Pages taller than ``max_page_length`` points are duplicated once per
    vertical slice and each copy's mediabox is cropped to its slice, so
    downstream hi_res processing never sees an extremely long page.

    Parameters
    ----------
    pdf : PdfReader
        The source document.

    Returns
    -------
    PdfReader
        The original reader unchanged when no page exceeds the limit
        (cheap early exit), otherwise a new reader over the re-written
        document.
    """
    max_page_length = 4000  # points; pages taller than this get sliced

    # Early exit when every page is already under the limit.  Uses a
    # consistent `<=` so a page of exactly max_page_length does not
    # trigger a pointless full-document rewrite (the old scan used `>=`
    # while the split loop used `<=`, so such a page was flagged but
    # never cropped).
    if all(page.mediabox.height <= max_page_length for page in pdf.pages):
        return pdf

    writer = PdfWriter()
    # 1-based page number in the writer -> crop coordinates for that slice.
    pages_to_crop: dict[int, dict[str, float]] = {}
    page_num = 0

    for page in pdf.pages:
        if page.mediabox.height <= max_page_length:
            page_num += 1
            writer.add_page(page)
            continue

        num_slices = math.ceil(page.mediabox.height / max_page_length)

        # Walk down from the page's real top coordinate.  Do NOT assume
        # the mediabox bottom is 0 (the old code started at
        # mediabox.height, which is only correct when bottom == 0).
        slice_top = float(page.mediabox.top)
        bottom_limit = float(page.mediabox.bottom)
        for _ in range(num_slices):
            # Clamp the final slice to the page's actual bottom edge so
            # the crop box never extends below the page.
            slice_bottom = max(slice_top - max_page_length, bottom_limit)
            page_num += 1
            pages_to_crop[page_num] = {"top": slice_top, "bottom": slice_bottom}
            writer.add_page(page)
            slice_top = slice_bottom

    # Crop after all pages are added; add_page copies the source page, so
    # each duplicate gets its own mediabox.
    for number, page in enumerate(writer.pages, start=1):
        crop = pages_to_crop.get(number)
        if crop is None:
            continue
        page.mediabox.top = crop["top"]
        page.mediabox.bottom = crop["bottom"]

    chunk_buffer = io.BytesIO()
    writer.write(chunk_buffer)
    chunk_buffer.seek(0)
    return PdfReader(chunk_buffer)