diff --git a/docs/user/extract-images.md b/docs/user/extract-images.md
index 3bce343a3..1873778ae 100644
--- a/docs/user/extract-images.md
+++ b/docs/user/extract-images.md
@@ -19,3 +19,25 @@ for image_file_object in page.images:
         fp.write(image_file_object.data)
         count += 1
 ```
+
+# Other images
+
+Some other objects can contain images, such as stamp annotations.
+
+For example, this document contains such stamps:
+[test_stamp.pdf](https://github.com/user-attachments/files/15751424/test_stamp.pdf)
+
+You can extract the image from the annotation with the following code:
+
+```python
+from pypdf import PdfReader
+
+reader = PdfReader("test_stamp.pdf")
+im = (
+    reader.pages[0]["/Annots"][0]
+    .get_object()["/AP"]["/N"]["/Resources"]["/XObject"]["/Im4"]
+    .decode_as_image()
+)
+
+im.show()
+```
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
index 1688d5d5c..cedde3c8f 100644
--- a/pypdf/generic/_data_structures.py
+++ b/pypdf/generic/_data_structures.py
@@ -955,6 +955,35 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject":
         retval._data = FlateDecode.encode(b_(self._data), level)
         return retval
 
+    def decode_as_image(self) -> Any:
+        """
+        Try to decode the stream object as an image.
+
+        A warning is logged when ``/Subtype`` is not ``/Image``; decoding
+        is attempted anyway.
+
+        Returns:
+            A PIL image if proper decoding has been found, otherwise ``None``.
+
+        Raises:
+            Exception: Any error raised during decoding (for example due to
+                an invalid object) is propagated to the caller.
+                It is recommended to catch exceptions to prevent
+                stops in your program.
+        """
+        from ..filters import _xobj_to_image
+
+        if self.get("/Subtype", "") != "/Image":
+            try:
+                msg = f"{self.indirect_reference} does not seem to be an Image"  # pragma: no cover
+            except AttributeError:
+                msg = f"{self!r} object does not seem to be an Image"  # pragma: no cover
+            logger_warning(msg, __name__)
+        extension, _, img = _xobj_to_image(self)
+        if extension is None:
+            return None  # pragma: no cover
+        return img
+
 
 class DecodedStreamObject(StreamObject):
     pass
diff --git a/tests/test_images.py b/tests/test_images.py
index 5982ecf20..5955bf47c 100644
--- a/tests/test_images.py
+++ b/tests/test_images.py
@@ -441,3 +441,26 @@ def test_inline_image_extraction():
     name = "iss2598d.png"
     img = Image.open(BytesIO(get_data_from_url(url, name=name)))
     assert image_similarity(reader.pages[0].images[0].image, img) == 1
+
+
+@pytest.mark.enable_socket()
+def test_extract_image_from_object(caplog):
+    url = "https://github.com/py-pdf/pypdf/files/15176076/B2.pdf"
+    name = "iss2613.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    image = reader.pages[0]["/Resources"]["/Pattern"]["/P1"]["/Resources"]["/XObject"][
+        "/X1"
+    ].decode_as_image()
+    assert isinstance(image, Image.Image)
+    # A content stream is not an image: a warning is logged, then decoding fails.
+    co = reader.pages[0].get_contents()
+    with pytest.raises(Exception):
+        co.decode_as_image()
+    assert "does not seem to be an Image" in caplog.text
+    caplog.clear()
+    # Reuse the same object: rebinding `co` from get_contents() here could
+    # drop the attribute set below and skip the indirect_reference message.
+    co.indirect_reference = "for_test"
+    with pytest.raises(Exception):
+        co.decode_as_image()
+    assert "for_test does not seem to be an Image" in caplog.text