ENH: Accelerate image list keys generation (#2014)

Iterating over the images of `009-pdflatex-geotopo/GeoTopo-komprimiert.pdf` previously took about 2.88s; with this PR it takes 0.44s. In particular, parsing `X14.jpg` of that PDF took `0.34s` and now takes `0.01s`. Closes #1987
py-pdf · Jul 28, 2023 · 94f23f9 · 94f23f9
1 parent 277643f
commit 94f23f9
Show file tree

Hide file tree

Showing 3 changed files with 34 additions and 7 deletions.
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -28,6 +28,7 @@
 # POSSIBILITY OF SUCH DAMAGE.
 
 import math
+import re
 import warnings
 from decimal import Decimal
 from typing import (
@@ -55,6 +56,7 @@
  mult,
 )
 from ._utils import (
+ WHITESPACES_AS_REGEXP,
  CompressedTransformationMatrix,
  File,
  ImageFile,
@@ -342,6 +344,8 @@ def __init__(
  DictionaryObject.__init__(self)
  self.pdf: Union[None, PdfReaderProtocol, PdfWriterProtocol] = pdf
  self.inline_images: Optional[Dict[str, ImageFile]] = None
+ # below Union for mypy but actually Optional[List[str]]
+ self.inline_images_keys: Optional[List[Union[str, List[str]]]] = None
  if indirect_ref is not None: # deprecated
  warnings.warn(
  (
@@ -475,8 +479,14 @@ def _old_images(self) -> List[File]: # deprecated
  def _get_ids_image(
  self, obj: Optional[DictionaryObject] = None, ancest: Optional[List[str]] = None
  ) -> List[Union[str, List[str]]]:
- if self.inline_images is None:
- self.inline_images = self._get_inline_images()
+ if self.inline_images_keys is None:
+ nb_inlines = len(
+ re.findall(
+ WHITESPACES_AS_REGEXP + b"BI" + WHITESPACES_AS_REGEXP,
+ self._get_contents_as_bytes() or b"",
+ )
+ )
+ self.inline_images_keys = [f"~{x}~" for x in range(nb_inlines)]
  if obj is None:
  obj = self
  if ancest is None:
@@ -485,15 +495,15 @@ def _get_ids_image(
  if PG.RESOURCES not in obj or RES.XOBJECT not in cast(
  DictionaryObject, obj[PG.RESOURCES]
  ):
- return list(self.inline_images.keys())
+ return self.inline_images_keys
 
  x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore
  for o in x_object:
  if x_object[o][IA.SUBTYPE] == "/Image":
  lst.append(o if len(ancest) == 0 else ancest + [o])
  else: # is a form with possible images inside
  lst.extend(self._get_ids_image(x_object[o], ancest + [o]))
- return lst + list(self.inline_images.keys())
+ return lst + self.inline_images_keys
 
  def _get_image(
  self,
@@ -515,6 +525,8 @@ def _get_image(
  raise
  if isinstance(id, str):
  if id[0] == "~" and id[-1] == "~":
+ if self.inline_images is None:
+ self.inline_images = self._get_inline_images()
  if self.inline_images is None: # pragma: no cover
  raise KeyError("no inline image can be found")
  return self.inline_images[id]
@@ -894,6 +906,23 @@ def _add_transformation_matrix(
  )
  return contents
 
+ def _get_contents_as_bytes(self) -> Optional[bytes]:
+ """
+ Return the page contents as bytes.
+
+ Returns:
+ The ``/Contents`` object as bytes, or ``None`` if it doesn't exist.
+
+ """
+ if PG.CONTENTS in self:
+ obj = self[PG.CONTENTS].get_object()
+ if isinstance(obj, list):
+ return b"".join(x.get_object().get_data() for x in obj)
+ else:
+ return cast(bytes, cast(EncodedStreamObject, obj).get_data())
+ else:
+ return None
+
  def get_contents(self) -> Optional[ContentStream]:
  """
  Access the page contents.

diff --git a/pypdf/_utils.py b/pypdf/_utils.py
@@ -382,6 +382,7 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]:
 
 
 WHITESPACES = (b" ", b"\n", b"\r", b"\t", b"\x00")
+WHITESPACES_AS_REGEXP = b"[ \n\r\t\x00]"
 
 
 def paeth_predictor(left: int, up: int, up_left: int) -> int:

diff --git a/tests/test_workflows.py b/tests/test_workflows.py
@@ -1016,9 +1016,6 @@ def test_inline_images():
  _a[x] = y
  with pytest.raises(KeyError) as exc:
  reader.pages[2]._get_image(("test",))
- reader.pages[2].inline_images = None
- with pytest.raises(KeyError) as exc:
- reader.pages[2]._get_image(("~1~",))
 
 
 @pytest.mark.enable_socket()