From f4457249a75ea3a045a278af9aab524f7e8d9016 Mon Sep 17 00:00:00 2001
From: Christine Straub <christinemstraub@gmail.com>
Date: Tue, 28 May 2024 21:53:17 -0700
Subject: [PATCH] fix: `partition_pdf()` removes spaces from the text (#3106)

Closes #2896.

This PR fixes `partition_pdf()` so that spaces in the extracted text are
preserved. When inferred and embedded elements are merged, the tab control
character `\t` is now replaced with a space instead of being removed, so
words that were separated by tabs no longer run together.

### Testing
PDF:
[rok_20230930_1-1.pdf](https://github.com/Unstructured-IO/unstructured/files/15001636/rok_20230930_1-1.pdf)
```python
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="rok_20230930_1-1.pdf",
    strategy="hi_res",
)

print(str(elements[20]))
```
**Results:**
- PR
```
Name of each exchange on which registered New York Stock Exchange
```
- main branch
```
Nameofeachexchangeonwhichregistered NewYorkStockExchange
```
---
 CHANGELOG.md                                                  | 3 ++-
 test_unstructured/partition/pdf_image/test_pdf_image_utils.py | 2 +-
 unstructured/__version__.py                                   | 2 +-
 unstructured/partition/pdf_image/pdf_image_utils.py           | 2 +-
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8348323432..70fb72e51b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.14.3-dev5
+## 0.14.3
 
 ### Enhancements
 
@@ -10,6 +10,7 @@
 
 ### Fixes
 
+* **Fix `partition_pdf()` to keep spaces in the text**. The control character `\t` is now replaced with a space instead of being removed when merging inferred elements with embedded elements.
 * **Turn off XML resolve entities** Sets `resolve_entities=False` for XML parsing with `lxml`
   to avoid text being dynamically injected into the XML document.
 * **Add backward compatibility for the deprecated pdf_infer_table_structure parameter**.
diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
index 0011406109..fc4b49bdf8 100644
--- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
+++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
@@ -347,7 +347,7 @@ def test_annotate_layout_elements_file_not_found_error():
 
 @pytest.mark.parametrize(
     ("text", "expected"),
-    [("c\to\x0cn\ftrol\ncharacter\rs\b", "control characters"), ("\"'\\", "\"'\\")],
+    [("test\tco\x0cn\ftrol\ncharacter\rs\b", "test control characters"), ("\"'\\", "\"'\\")],
 )
 def test_remove_control_characters(text, expected):
     assert pdf_image_utils.remove_control_characters(text) == expected
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 6239897e42..541dd86bd1 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.14.3-dev5"  # pragma: no cover
+__version__ = "0.14.3"  # pragma: no cover
diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py
index d58576e5b6..2c297c2d0b 100644
--- a/unstructured/partition/pdf_image/pdf_image_utils.py
+++ b/unstructured/partition/pdf_image/pdf_image_utils.py
@@ -427,7 +427,7 @@ def remove_control_characters(text: str) -> str:
     """Removes control characters from text."""
 
     # Replace newline character with a space
-    text = text.replace("\n", " ")
+    text = text.replace("\t", " ").replace("\n", " ")
     # Remove other control characters
     out_text = "".join(c for c in text if unicodedata.category(c)[0] != "C")
     return out_text