From 854c46785a27cb6d081b89ddf004436aacdf0fe0 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 1 May 2024 14:15:57 +0200 Subject: [PATCH 1/8] ENH: add decode_as_image() to ContentStreams closes #2613 --- pypdf/generic/_data_structures.py | 20 ++++++++++++++++++++ tests/test_images.py | 21 +++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 3ca761403..cae9a2c04 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -947,6 +947,26 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject": retval._data = FlateDecode.encode(b_(self._data), level) return retval + def decode_as_image(self) -> Any: + """ + Try to decode the stream object as an image + + Returns: + a PIL image if proper decoding has been found + """ + from ..filters import _xobj_to_image + + if self.get("/Subtype", "") != "/Image": + try: + msg = f"{self.indirect_reference} does not seems to be an Image" # pragma: no cover + except AttributeError: + msg = f"{self.__repr__()} object does not seems to be an Image" # pragma: no cover + logger_warning(msg, __name__) + extension, byte_stream, img = _xobj_to_image(self) + if extension is None: + return None # pragma: no cover + return img + class DecodedStreamObject(StreamObject): pass diff --git a/tests/test_images.py b/tests/test_images.py index ad694d669..df64d0cfe 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -346,3 +346,24 @@ def test_corrupted_jpeg_iss2266(pdf, pdf_name, images, images_name, filtr): print(fn) # noqa: T201 img = Image.open(BytesIO(zf.read(fn))) assert image_similarity(reader.pages[p].images[i].image, img) >= 0.99 + + +@pytest.mark.enable_socket() +def test_extract_image_from_object(caplog): + url = "https://github.com/py-pdf/pypdf/files/15176076/B2.pdf" + name = "iss2613.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + image = reader.pages[0]["/Resources"]["/Pattern"]["/P1"]["/Resources"]["/XObject"][ + "/X1" + ].decode_as_image() + assert isinstance(image, Image.Image) + with pytest.raises(Exception): + co = reader.pages[0].get_contents() + co.decode_as_image() + assert "does not seems to be an Image" in caplog.text + caplog.clear() + co.indirect_reference = "for_test" + with pytest.raises(Exception): + co = reader.pages[0].get_contents() + co.decode_as_image() + assert "does not seems to be an Image" in caplog.text From 0fb2a735e46cdb67e6652168711003d4e26f69a5 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 1 May 2024 18:26:54 +0200 Subject: [PATCH 2/8] add annotation about exceptions --- pypdf/generic/_data_structures.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index cae9a2c04..bac4c4a23 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -953,6 +953,11 @@ def decode_as_image(self) -> Any: Returns: a PIL image if proper decoding has been found + Raises: + Exceptions during decoding to to invalid object or + errors during decoding will be reported + It is recommended to catch exceptions to prevent + stops in your program. """ from ..filters import _xobj_to_image From 6b83ef5b633d0c9f405a0e31ae9c2ab75d80a636 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 1 May 2024 22:50:55 +0200 Subject: [PATCH 3/8] fix doc --- pypdf/generic/_data_structures.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index bac4c4a23..8f524ca08 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -954,10 +954,10 @@ def decode_as_image(self) -> Any: Returns: a PIL image if proper decoding has been found Raises: - Exceptions during decoding to to invalid object or - errors during decoding will be reported - It is recommended to catch exceptions to prevent - stops in your program. + Exception: (any)during decoding to to invalid object or + errors during decoding will be reported + It is recommended to catch exceptions to prevent + stops in your program. """ from ..filters import _xobj_to_image From 57c787c0fa356bd0c2345412303e52bd394f8b99 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 9 Jun 2024 11:56:00 +0200 Subject: [PATCH 4/8] Update pypdf/generic/_data_structures.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/generic/_data_structures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 9a3cc0c99..2ba280ccb 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -971,7 +971,7 @@ def decode_as_image(self) -> Any: if self.get("/Subtype", "") != "/Image": try: - msg = f"{self.indirect_reference} does not seems to be an Image" # pragma: no cover + msg = f"{self.indirect_reference} does not seem to be an Image" # pragma: no cover except AttributeError: msg = f"{self.__repr__()} object does not seems to be an Image" # pragma: no cover logger_warning(msg, __name__) From 66aa39440301a083c37d1aa55d229577b357b6e7 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 9 Jun 2024 11:56:09 +0200 Subject: [PATCH 5/8] Update pypdf/generic/_data_structures.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/generic/_data_structures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 2ba280ccb..cedde3c8f 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -973,7 +973,7 @@ def decode_as_image(self) -> Any: try: msg = f"{self.indirect_reference} does not seem to be an Image" # pragma: no cover except AttributeError: - msg = f"{self.__repr__()} object does not seems to be an Image" # pragma: no cover + msg = f"{self.__repr__()} object does not seem to be an Image" # pragma: no cover logger_warning(msg, __name__) extension, byte_stream, img = _xobj_to_image(self) if extension is None: From 9ced094b6209f2e5d9ecb500028d463b02bf54d3 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 9 Jun 2024 12:29:26 +0200 Subject: [PATCH 6/8] fix test + add documentation --- docs/user/extract-images.md | 23 +++++++++++++++++++++++ tests/test_images.py | 6 +++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/docs/user/extract-images.md b/docs/user/extract-images.md index 3bce343a3..91d8f4611 100644 --- a/docs/user/extract-images.md +++ b/docs/user/extract-images.md @@ -19,3 +19,26 @@ for image_file_object in page.images: fp.write(image_file_object.data) count += 1 ``` + +# Other images + +Some other objects can contain images, such as stamp annotations. + +For example, this document contains such stamps: + +[test_stamp.pdf](https://github.com/user-attachments/files/15751424/test_stamp.pdf) + +you can extract the image from the annotation with the following code + +```python +from pypdf import PdfReader + +reader = PdfReader("test_stamp.pdf") +im = ( + reader.pages[0]["/Annots"][0] + .get_object()["/AP"]["/N"]["/Resources"]["/XObject"]["/Im4"] + .decode_as_image() +) + +im.show() +``` diff --git a/tests/test_images.py b/tests/test_images.py index d802f4bb4..5955bf47c 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -442,7 +442,7 @@ def test_inline_image_extraction(): img = Image.open(BytesIO(get_data_from_url(url, name=name))) assert image_similarity(reader.pages[0].images[0].image, img) == 1 - + @pytest.mark.enable_socket() def test_extract_image_from_object(caplog): url = "https://github.com/py-pdf/pypdf/files/15176076/B2.pdf" @@ -455,10 +455,10 @@ def test_extract_image_from_object(caplog): with pytest.raises(Exception): co = reader.pages[0].get_contents() co.decode_as_image() - assert "does not seems to be an Image" in caplog.text + assert "does not seem to be an Image" in caplog.text caplog.clear() co.indirect_reference = "for_test" with pytest.raises(Exception): co = reader.pages[0].get_contents() co.decode_as_image() - assert "does not seems to be an Image" in caplog.text + assert "does not seem to be an Image" in caplog.text From 561412df6ef31d6f8201851a08148c0452128e9c Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 9 Jun 2024 12:33:31 +0200 Subject: [PATCH 7/8] Update docs/user/extract-images.md Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- docs/user/extract-images.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user/extract-images.md b/docs/user/extract-images.md index 91d8f4611..5fccee681 100644 --- a/docs/user/extract-images.md +++ b/docs/user/extract-images.md @@ -28,7 +28,7 @@ For example, this document contains such stamps: [test_stamp.pdf](https://github.com/user-attachments/files/15751424/test_stamp.pdf) -you can extract the image from the annotation with the following code +You can extract the image from the annotation with the following code: ```python from pypdf import PdfReader From 604e2b81e06f122736b1471d5a5202f834a8d0f2 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 9 Jun 2024 12:34:37 +0200 Subject: [PATCH 8/8] style --- docs/user/extract-images.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/user/extract-images.md b/docs/user/extract-images.md index 5fccee681..1873778ae 100644 --- a/docs/user/extract-images.md +++ b/docs/user/extract-images.md @@ -25,7 +25,6 @@ for image_file_object in page.images: Some other objects can contain images, such as stamp annotations. For example, this document contains such stamps: - [test_stamp.pdf](https://github.com/user-attachments/files/15751424/test_stamp.pdf) You can extract the image from the annotation with the following code: