From f4457249a75ea3a045a278af9aab524f7e8d9016 Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Tue, 28 May 2024 21:53:17 -0700 Subject: [PATCH] fix: `partition_pdf()` removes spaces from the text (#3106) Closes #2896. This PR aims to fix `partition_pdf()` to keep spaces in text. The control character `\t` is now replaced with a space instead of being removed when merging inferred and embedded elements. ### Testing PDF: [rok_20230930_1-1.pdf](https://github.com/Unstructured-IO/unstructured/files/15001636/rok_20230930_1-1.pdf) ``` elements = partition_pdf( filename="rok_20230930_1-1.pdf", strategy="hi_res", ) print(str(elements[20])) ``` **Results:** - PR ``` Name of each exchange on which registered New York Stock Exchange ``` - main branch ``` Nameofeachexchangeonwhichregistered NewYorkStockExchange ``` --- CHANGELOG.md | 3 ++- test_unstructured/partition/pdf_image/test_pdf_image_utils.py | 2 +- unstructured/__version__.py | 2 +- unstructured/partition/pdf_image/pdf_image_utils.py | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8348323432..70fb72e51b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.3-dev5 +## 0.14.3 ### Enhancements @@ -10,6 +10,7 @@ ### Fixes +* **Fix `partition_pdf()` to keep spaces in the text**. The control character `\t` is now replaced with a space instead of being removed when merging inferred elements with embedded elements. * **Turn off XML resolve entities** Sets `resolve_entities=False` for XML parsing with `lxml` to avoid text being dynamically injected into the XML document. * **Add backward compatibility for the deprecated pdf_infer_table_structure parameter**. diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py index 0011406109..fc4b49bdf8 100644 --- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py @@ -347,7 +347,7 @@ def test_annotate_layout_elements_file_not_found_error(): @pytest.mark.parametrize( ("text", "expected"), - [("c\to\x0cn\ftrol\ncharacter\rs\b", "control characters"), ("\"'\\", "\"'\\")], + [("test\tco\x0cn\ftrol\ncharacter\rs\b", "test control characters"), ("\"'\\", "\"'\\")], ) def test_remove_control_characters(text, expected): assert pdf_image_utils.remove_control_characters(text) == expected diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 6239897e42..541dd86bd1 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.3-dev5" # pragma: no cover +__version__ = "0.14.3" # pragma: no cover diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py index d58576e5b6..2c297c2d0b 100644 --- a/unstructured/partition/pdf_image/pdf_image_utils.py +++ b/unstructured/partition/pdf_image/pdf_image_utils.py @@ -427,7 +427,7 @@ def remove_control_characters(text: str) -> str: """Removes control characters from text.""" # Replace newline character with a space - text = text.replace("\n", " ") + text = text.replace("\t", " ").replace("\n", " ") # Remove other control characters out_text = "".join(c for c in text if unicodedata.category(c)[0] != "C") return out_text