Compute inline-image ids without parsing the inline images.

Summary of the hunks below:

* pypdf/_page.py
  - import ``re`` and ``WHITESPACES_AS_REGEXP``.
  - add the ``inline_images_keys`` attribute (initialised to ``None``).
  - ``_get_ids_image`` no longer calls ``_get_inline_images()``; it counts
    whitespace-delimited ``BI`` tokens in the raw content bytes and builds
    the ids ``~0~``, ``~1~``, ... from that count, caching them in
    ``inline_images_keys``.
  - ``_get_image`` calls ``_get_inline_images()`` on the first ``~n~``
    lookup when ``inline_images`` is still ``None``.
  - add ``_get_contents_as_bytes``, returning the ``/Contents`` data as
    bytes (joined when ``/Contents`` is a list) or ``None`` when absent.
* pypdf/_utils.py
  - add ``WHITESPACES_AS_REGEXP``, a bytes character class listing the same
    five bytes as ``WHITESPACES``.
* tests/test_workflows.py
  - drop the assertion that ``_get_image(("~1~",))`` raises ``KeyError``
    after ``inline_images`` is reset to ``None``.

NOTE(review): the line structure of this patch was restored from a copy
whose newlines and indentation had been collapsed. Hunk line counts were
checked against the ``@@`` headers; leading whitespace on context lines
was reconstructed from the Python block structure and should be confirmed
against the target tree (or applied with ``git apply --ignore-whitespace``).

diff --git a/pypdf/_page.py b/pypdf/_page.py
index 081eb8815..2d31afafe 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -28,6 +28,7 @@
 # POSSIBILITY OF SUCH DAMAGE.
 
 import math
+import re
 import warnings
 from decimal import Decimal
 from typing import (
@@ -55,6 +56,7 @@
     mult,
 )
 from ._utils import (
+    WHITESPACES_AS_REGEXP,
     CompressedTransformationMatrix,
     File,
     ImageFile,
@@ -342,6 +344,8 @@ def __init__(
         DictionaryObject.__init__(self)
         self.pdf: Union[None, PdfReaderProtocol, PdfWriterProtocol] = pdf
         self.inline_images: Optional[Dict[str, ImageFile]] = None
+        # below Union for mypy but actually Optional[List[str]]
+        self.inline_images_keys: Optional[List[Union[str, List[str]]]] = None
         if indirect_ref is not None:  # deprecated
             warnings.warn(
                 (
@@ -475,8 +479,14 @@ def _old_images(self) -> List[File]:  # deprecated
     def _get_ids_image(
         self, obj: Optional[DictionaryObject] = None, ancest: Optional[List[str]] = None
     ) -> List[Union[str, List[str]]]:
-        if self.inline_images is None:
-            self.inline_images = self._get_inline_images()
+        if self.inline_images_keys is None:
+            nb_inlines = len(
+                re.findall(
+                    WHITESPACES_AS_REGEXP + b"BI" + WHITESPACES_AS_REGEXP,
+                    self._get_contents_as_bytes() or b"",
+                )
+            )
+            self.inline_images_keys = [f"~{x}~" for x in range(nb_inlines)]
         if obj is None:
             obj = self
         if ancest is None:
@@ -485,7 +495,7 @@ def _get_ids_image(
         if PG.RESOURCES not in obj or RES.XOBJECT not in cast(
             DictionaryObject, obj[PG.RESOURCES]
         ):
-            return list(self.inline_images.keys())
+            return self.inline_images_keys
 
         x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object()  # type: ignore
         for o in x_object:
@@ -493,7 +503,7 @@ def _get_ids_image(
                 lst.append(o if len(ancest) == 0 else ancest + [o])
             else:  # is a form with possible images inside
                 lst.extend(self._get_ids_image(x_object[o], ancest + [o]))
-        return lst + list(self.inline_images.keys())
+        return lst + self.inline_images_keys
 
     def _get_image(
         self,
@@ -515,6 +525,8 @@ def _get_image(
                 raise
         if isinstance(id, str):
             if id[0] == "~" and id[-1] == "~":
+                if self.inline_images is None:
+                    self.inline_images = self._get_inline_images()
                 if self.inline_images is None:  # pragma: no cover
                     raise KeyError("no inline image can be found")
                 return self.inline_images[id]
@@ -894,6 +906,23 @@ def _add_transformation_matrix(
             )
         return contents
 
+    def _get_contents_as_bytes(self) -> Optional[bytes]:
+        """
+        Return the page contents as bytes.
+
+        Returns:
+            The ``/Contents`` object as bytes, or ``None`` if it doesn't exist.
+
+        """
+        if PG.CONTENTS in self:
+            obj = self[PG.CONTENTS].get_object()
+            if isinstance(obj, list):
+                return b"".join(x.get_object().get_data() for x in obj)
+            else:
+                return cast(bytes, cast(EncodedStreamObject, obj).get_data())
+        else:
+            return None
+
     def get_contents(self) -> Optional[ContentStream]:
         """
         Access the page contents.
diff --git a/pypdf/_utils.py b/pypdf/_utils.py
index da121ac55..06845d6ac 100644
--- a/pypdf/_utils.py
+++ b/pypdf/_utils.py
@@ -382,6 +382,7 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]:
 
 
 WHITESPACES = (b" ", b"\n", b"\r", b"\t", b"\x00")
+WHITESPACES_AS_REGEXP = b"[ \n\r\t\x00]"
 
 
 def paeth_predictor(left: int, up: int, up_left: int) -> int:
diff --git a/tests/test_workflows.py b/tests/test_workflows.py
index c24399f83..1c06c02df 100644
--- a/tests/test_workflows.py
+++ b/tests/test_workflows.py
@@ -1016,9 +1016,6 @@ def test_inline_images():
         _a[x] = y
     with pytest.raises(KeyError) as exc:
         reader.pages[2]._get_image(("test",))
-    reader.pages[2].inline_images = None
-    with pytest.raises(KeyError) as exc:
-        reader.pages[2]._get_image(("~1~",))
 
 
 @pytest.mark.enable_socket()