adding logic to split pages that are too large to process
jordan-homan committed Nov 21, 2024
1 parent a9b7b0b commit 3e15249
Showing 3 changed files with 62 additions and 0 deletions.
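The idea behind the change: any PDF page taller than 4,000 points is tiled into ceil(height / 4000) stacked windows before the client's regular page-splitting logic runs, so that no single "page" sent to the API is too tall to process. As a minimal standalone sketch of that arithmetic (standard library only; chunk_windows is an illustrative helper, not part of the client):

import math

MAX_PAGE_HEIGHT = 4000  # points; the threshold used by the new hook logic

def chunk_windows(page_height: float, max_height: float = MAX_PAGE_HEIGHT):
    """Return the (top, bottom) mediabox windows that tile one tall page."""
    windows = []
    top = page_height
    for _ in range(math.ceil(page_height / max_height)):
        windows.append((top, top - max_height))
        top -= max_height
    return windows

# A 9000pt-tall page becomes three stacked windows; as in the hook itself,
# the last window may extend below 0 and simply crops to empty space.
print(chunk_windows(9000))  # [(9000, 5000), (5000, 1000), (1000, -3000)]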
Binary file added _sample_docs/super_long_pages.pdf
14 changes: 14 additions & 0 deletions _test_unstructured_client/integration/test_decorators.py
@@ -185,7 +185,21 @@ def test_integration_split_pdf_with_caching(
    if cache_dir:
        assert not Path(cache_dir).exists()

@pytest.mark.parametrize("filename", ["_sample_docs/super_long_pages.pdf"])
def test_long_pages_hi_res(filename):
    req = operations.PartitionRequest(
        partition_parameters=shared.PartitionParameters(
            files=shared.Files(
                content=open(filename, "rb"),
                file_name=filename,
            ),
            strategy=shared.Strategy.HI_RES,
            split_pdf_page=True,
            split_pdf_allow_failed=True,
            split_pdf_concurrency_level=15,
        ),
    )

    client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")

    response = client.general.partition(request=req)
    assert response.status_code == 200
    assert len(response.elements)

def test_integration_split_pdf_for_file_with_no_name():
"""
48 changes: 48 additions & 0 deletions src/unstructured_client/_hooks/custom/split_pdf_hook.py
@@ -317,6 +317,8 @@ def before_request(
            fallback_value=DEFAULT_CACHE_TMP_DATA_DIR,
        )

        pdf = self._clean_large_pages(pdf)

        page_range_start, page_range_end = form_utils.get_page_range(
            form_data,
            key=PARTITION_FORM_PAGE_RANGE_KEY.replace("[]", ""),
@@ -423,6 +425,52 @@ async def call_api_partial(

        return response

    def _clean_large_pages(self, pdf: PdfReader) -> PdfReader:
        """Split any page taller than max_page_length into multiple copies of
        that page, each windowed (via its mediabox) to one slice of the original."""
        max_page_length = 4000
        any_page_over_maximum_length = any(
            page.mediabox.height >= max_page_length for page in pdf.pages
        )

        # early exit if all pages are safely under the max page length
        if not any_page_over_maximum_length:
            return pdf

        w = PdfWriter()
        page_nums = 0

        map_of_pages_to_clean = {}
        for page in pdf.pages:
            if page.mediabox.height <= max_page_length:
                page_nums += 1
                w.add_page(page)
                continue

            # add one copy of the page per max_page_length-tall slice
            num_pages_to_add = math.ceil(page.mediabox.height / max_page_length)

            page_start = page.mediabox.height
            page_end = page_start - max_page_length
            for _ in range(num_pages_to_add):
                page_nums += 1
                map_of_pages_to_clean[page_nums] = {"top": page_start, "bottom": page_end}
                w.add_page(page)
                page_start = page_end
                page_end = page_start - max_page_length

        # crop each duplicated page down to its slice of the original page;
        # pages copied over unsplit have no entry in the map and are skipped
        page_nums = 0
        for page in w.pages:
            page_nums += 1
            if page_nums not in map_of_pages_to_clean:
                continue
            page.mediabox.top = map_of_pages_to_clean[page_nums]["top"]
            page.mediabox.bottom = map_of_pages_to_clean[page_nums]["bottom"]

        chunk_buffer = io.BytesIO()
        w.write(chunk_buffer)
        chunk_buffer.seek(0)
        return PdfReader(chunk_buffer)

    def _get_pdf_chunks_in_memory(
        self,
        pdf: PdfReader,
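To see the mediabox windowing in isolation, here is a self-contained sketch assuming pypdf (the library that provides the PdfReader and PdfWriter used above); the 9000pt blank page is an illustrative stand-in for a real long-page document:

import io
import math

from pypdf import PdfReader, PdfWriter

MAX_PAGE_HEIGHT = 4000

# build a one-page PDF that is far taller than the threshold
src = PdfWriter()
src.add_blank_page(width=612, height=9000)
buf = io.BytesIO()
src.write(buf)
buf.seek(0)

page = PdfReader(buf).pages[0]
out = PdfWriter()
top = page.mediabox.height
for _ in range(math.ceil(page.mediabox.height / MAX_PAGE_HEIGHT)):
    # add_page copies the page into the writer, so each mediabox edit below
    # is independent (the hook's own loop relies on the same behavior)
    out.add_page(page)
    out.pages[-1].mediabox.top = top
    out.pages[-1].mediabox.bottom = top - MAX_PAGE_HEIGHT
    top -= MAX_PAGE_HEIGHT

for p in out.pages:
    print(p.mediabox.top, p.mediabox.bottom)  # 9000 5000, 5000 1000, 1000 -3000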
