From c04a6bba997d353dbe4efe160596472d36c07604 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Wed, 2 Aug 2023 22:42:18 +0200 Subject: [PATCH] DOC: Post-processing page (#2052) Closes #2046 --- docs/index.rst | 1 + .../post-processing-in-text-extraction.md | 113 ++++++++++++++++++ pypdf/_page.py | 5 +- 3 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 docs/user/post-processing-in-text-extraction.md diff --git a/docs/index.rst b/docs/index.rst index 0bb7d9a8e..c876595ec 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -24,6 +24,7 @@ You can contribute to `pypdf on GitHub <https://github.com/py-pdf/pypdf>`_. user/suppress-warnings user/metadata user/extract-text + user/post-processing-in-text-extraction user/extract-images user/extract-attachments user/encryption-decryption diff --git a/docs/user/post-processing-in-text-extraction.md b/docs/user/post-processing-in-text-extraction.md new file mode 100644 index 000000000..464181823 --- /dev/null +++ b/docs/user/post-processing-in-text-extraction.md @@ -0,0 +1,113 @@ +# Post-Processing in Text Extraction + +Post-processing can noticeably improve the results of text extraction. +It is, however, outside of the scope of pypdf itself. Hence the library will +not give any direct support for it. It is a natural language processing (NLP) +task. + +This page lists a few examples of what can be done as well as a community +recipe that can be used as a best-practice general-purpose post-processing +step. If you know more about the specific domain of your documents, e.g.
the +language, it is likely that you can find custom solutions that work better in +your context. + +## Ligature Replacement + +```python +def replace_ligatures(text: str) -> str: + ligatures = { + "ﬀ": "ff", + "ﬁ": "fi", + "ﬂ": "fl", + "ﬃ": "ffi", + "ﬄ": "ffl", + "ﬅ": "ft", + "ﬆ": "st", + # "Ꜳ": "AA", + # "Æ": "AE", + "ꜳ": "aa", + } + for search, replace in ligatures.items(): + text = text.replace(search, replace) + return text +``` + +## De-Hyphenation + +Hyphens are used to break words up so that the appearance of the page is nicer. + +```python +from typing import List + + +def remove_hyphens(text: str) -> str: + """ + + This fails for: + * Natural dashes: well-known, self-replication, use-cases, non-semantic, + Post-processing, Window-wise, viewpoint-dependent + * Trailing math operators: 2 - 4 + * Names: Lopez-Ferreras, VGG-19, CIFAR-100 + """ + lines = [line.rstrip() for line in text.split("\n")] + + # Find dashes + line_numbers = [] + for line_no, line in enumerate(lines[:-1]): + if line.endswith("-"): + line_numbers.append(line_no) + + # Replace + for line_no in line_numbers: + lines = dehyphenate(lines, line_no) + + return "\n".join(lines) + + +def dehyphenate(lines: List[str], line_no: int) -> List[str]: + next_line = lines[line_no + 1] + word_suffix = next_line.split(" ")[0] + + lines[line_no] = lines[line_no][:-1] + word_suffix + lines[line_no + 1] = lines[line_no + 1][len(word_suffix) :] + return lines +``` + +## Header/Footer Removal + +The following header/footer removal has several drawbacks: + +* False-positives, e.g. for the first page when there is a date like 2021. +* False-negatives in many cases: + * Dynamic part, e.g. page label is in the header + * Even/odd pages have different headers + * Some pages, e.g.
the first one or chapter pages, don't have a header + +```python +def remove_footer(extracted_texts: list[str], page_labels: list[str]): + def remove_page_labels(extracted_texts, page_labels): + processed = [] + for text, label in zip(extracted_texts, page_labels): + text_left = text.lstrip() + if text_left.startswith(label): + text = text_left[len(label) :] + + text_right = text.rstrip() + if text_right.endswith(label): + text = text_right[: -len(label)] + + processed.append(text) + return processed + + extracted_texts = remove_page_labels(extracted_texts, page_labels) + return extracted_texts +``` + +## Other ideas + +* Whitespaces between Units: There should be a space between a number and its unit + ([source](https://tex.stackexchange.com/questions/20962/should-i-put-a-space-between-a-number-and-its-unit)). + That means: 42 ms, 42 GHz, 42 GB. +* Percent: English style guides prescribe writing the percent sign following the number without any space between (e.g. 50%). +* Whitespaces before dots: Should typically be removed +* Whitespaces after dots: Should typically be added diff --git a/pypdf/_page.py b/pypdf/_page.py index d70b1b019..01c5b0506 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -557,13 +557,14 @@ def images(self) -> List[ImageFile]: Examples: reader.pages[0].images[0] # return fist image reader.pages[0].images['/I0'] # return image '/I0' - reader.pages[0].images['/TP1','/Image1'] # return image '/Image1' - within '/TP1' Xobject/Form + # return image '/Image1' within '/TP1' Xobject/Form: + reader.pages[0].images['/TP1','/Image1'] for img in reader.pages[0].images: # loop within all objects images.keys() and images.items() can be used. The ImageFile has the following properties: + `.name` : name of the object `.data` : bytes of the object `.image` : PIL Image Object