fix: partition_pdf() removes spaces from the text (#3106)

Closes #2896. This PR fixes `partition_pdf()` so that it keeps spaces in text. The control character `\t` is now replaced with a space instead of being removed when merging inferred and embedded elements.

### Testing

PDF: [rok_20230930_1-1.pdf](https://github.com/Unstructured-IO/unstructured/files/15001636/rok_20230930_1-1.pdf)

```
elements = partition_pdf(
    filename="rok_20230930_1-1.pdf",
    strategy="hi_res",
)
print(str(elements[20]))
```

**Results:**

- PR

```
Name of each exchange on which registered New York Stock Exchange
```

- main branch

```
Nameofeachexchangeonwhichregistered NewYorkStockExchange
```
Unstructured-IO · May 29, 2024 · f445724 · f445724
1 parent 3158169
commit f445724
Show file tree

Hide file tree

Showing 4 changed files with 5 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.14.3-dev5
+## 0.14.3
 
 ### Enhancements
 
@@ -10,6 +10,7 @@
 
 ### Fixes
 
+* **Fix `partition_pdf()` to keep spaces in the text**. The control character `\t` is now replaced with a space instead of being removed when merging inferred elements with embedded elements.
 * **Turn off XML resolve entities** Sets `resolve_entities=False` for XML parsing with `lxml`
   to avoid text being dynamically injected into the XML document.
 * **Add backward compatibility for the deprecated pdf_infer_table_structure parameter**.

diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
@@ -347,7 +347,7 @@ def test_annotate_layout_elements_file_not_found_error():
 
 @pytest.mark.parametrize(
     ("text", "expected"),
-    [("c\to\x0cn\ftrol\ncharacter\rs\b", "control characters"), ("\"'\\", "\"'\\")],
+    [("test\tco\x0cn\ftrol\ncharacter\rs\b", "test control characters"), ("\"'\\", "\"'\\")],
 )
 def test_remove_control_characters(text, expected):
     assert pdf_image_utils.remove_control_characters(text) == expected
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.14.3-dev5"  # pragma: no cover
+__version__ = "0.14.3"  # pragma: no cover
diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py
@@ -427,7 +427,7 @@ def remove_control_characters(text: str) -> str:
     """Removes control characters from text."""
 
     # Replace newline character with a space
-    text = text.replace("\n", " ")
+    text = text.replace("\t", " ").replace("\n", " ")
     # Remove other control characters
     out_text = "".join(c for c in text if unicodedata.category(c)[0] != "C")
     return out_text