
adding logic to trim pages that are too large to process #211

Merged
1 commit merged on Nov 22, 2024
Binary file added _sample_docs/super_long_pages.pdf
14 changes: 14 additions & 0 deletions _test_unstructured_client/integration/test_decorators.py
@@ -185,7 +185,21 @@ def test_integration_split_pdf_with_caching(
    if cache_dir:
        assert not Path(cache_dir).exists()

@pytest.mark.parametrize("filename", ["_sample_docs/super_long_pages.pdf"])
def test_long_pages_hi_res(filename):
    req = operations.PartitionRequest(partition_parameters=shared.PartitionParameters(
        files=shared.Files(content=open(filename, "rb"), file_name=filename),
        strategy=shared.Strategy.HI_RES,
        split_pdf_page=True,
        split_pdf_allow_failed=True,
        split_pdf_concurrency_level=15,
    ))

    client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")

    response = client.general.partition(request=req)
    assert response.status_code == 200
    assert len(response.elements)

def test_integration_split_pdf_for_file_with_no_name():
"""
32 changes: 32 additions & 0 deletions src/unstructured_client/_hooks/custom/split_pdf_hook.py
@@ -50,6 +50,8 @@
MAX_CONCURRENCY_LEVEL = 50
MIN_PAGES_PER_SPLIT = 2
MAX_PAGES_PER_SPLIT = 20
HI_RES_STRATEGY = 'hi_res'
MAX_PAGE_LENGTH = 4000


async def _order_keeper(index: int, coro: Awaitable) -> Tuple[int, httpx.Response]:
@@ -334,6 +336,8 @@ def before_request(
        if split_size >= page_count and page_count == len(pdf.pages):
            return request

        pdf = self._trim_large_pages(pdf, form_data)

        if self.cache_tmp_data_feature:
            pdf_chunk_paths = self._get_pdf_chunk_paths(
                pdf,
@@ -423,6 +427,34 @@ async def call_api_partial(

        return response

    def _trim_large_pages(self, pdf: PdfReader, form_data: dict[str, Any]) -> PdfReader:
        if form_data['strategy'] != HI_RES_STRATEGY:
            return pdf

        max_page_length = MAX_PAGE_LENGTH
        any_page_over_maximum_length = False
        for page in pdf.pages:
            if page.mediabox.height >= max_page_length:
                any_page_over_maximum_length = True

        # early exit if all pages are safely under the max page length
        if not any_page_over_maximum_length:
            return pdf

        w = PdfWriter()

        # trims large pages that exceed the maximum supported height for processing
        for page in pdf.pages:
            if page.mediabox.height >= max_page_length:
                page.mediabox.top = page.mediabox.height
                page.mediabox.bottom = page.mediabox.top - max_page_length
            w.add_page(page)

        chunk_buffer = io.BytesIO()
        w.write(chunk_buffer)
        chunk_buffer.seek(0)
        return PdfReader(chunk_buffer)

    def _get_pdf_chunks_in_memory(
        self,
        pdf: PdfReader,
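For context, the page-trimming approach used in `_trim_large_pages` can be sketched on its own. The snippet below is a minimal standalone illustration with pypdf, not part of this PR; the input file name is hypothetical, and the 4000-point cutoff mirrors MAX_PAGE_LENGTH above.

# Minimal sketch (not part of this PR): crop overly tall PDF pages with pypdf,
# mirroring the mediabox adjustment done in _trim_large_pages.
# "tall.pdf" is a hypothetical input; 4000 points mirrors MAX_PAGE_LENGTH.
import io

from pypdf import PdfReader, PdfWriter

MAX_PAGE_LENGTH = 4000  # maximum page height to keep, in PDF points

reader = PdfReader("tall.pdf")
writer = PdfWriter()

for page in reader.pages:
    if page.mediabox.height >= MAX_PAGE_LENGTH:
        # Keep only the top MAX_PAGE_LENGTH points of the page by raising
        # the bottom edge of the mediabox; content below it is cropped out.
        page.mediabox.top = page.mediabox.height
        page.mediabox.bottom = page.mediabox.top - MAX_PAGE_LENGTH
    writer.add_page(page)

buffer = io.BytesIO()
writer.write(buffer)
buffer.seek(0)
print([float(p.mediabox.height) for p in PdfReader(buffer).pages])  # all <= 4000

Because only the mediabox is adjusted, the original page content streams are untouched; downstream hi_res processing simply sees pages whose visible area is capped at the maximum supported height.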