Skip to content

Commit

Permalink
ENH: Accelerate image list keys generation (#2014)
Browse files Browse the repository at this point in the history
Iterating over the images of `009-pdflatex-geotopo/GeoTopo-komprimiert.pdf` previously took around 2.88s. With this PR, it takes 0.44s.

In particular, `X14.jpg` from that PDF took `0.34s` to parse and now takes `0.01s`.

Closes #1987
  • Loading branch information
pubpub-zz authored Jul 28, 2023
1 parent 277643f commit 94f23f9
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 7 deletions.
37 changes: 33 additions & 4 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
# POSSIBILITY OF SUCH DAMAGE.

import math
import re
import warnings
from decimal import Decimal
from typing import (
Expand Down Expand Up @@ -55,6 +56,7 @@
mult,
)
from ._utils import (
WHITESPACES_AS_REGEXP,
CompressedTransformationMatrix,
File,
ImageFile,
Expand Down Expand Up @@ -342,6 +344,8 @@ def __init__(
DictionaryObject.__init__(self)
self.pdf: Union[None, PdfReaderProtocol, PdfWriterProtocol] = pdf
self.inline_images: Optional[Dict[str, ImageFile]] = None
# below Union for mypy but actually Optional[List[str]]
self.inline_images_keys: Optional[List[Union[str, List[str]]]] = None
if indirect_ref is not None: # deprecated
warnings.warn(
(
Expand Down Expand Up @@ -475,8 +479,14 @@ def _old_images(self) -> List[File]: # deprecated
def _get_ids_image(
self, obj: Optional[DictionaryObject] = None, ancest: Optional[List[str]] = None
) -> List[Union[str, List[str]]]:
if self.inline_images is None:
self.inline_images = self._get_inline_images()
if self.inline_images_keys is None:
nb_inlines = len(
re.findall(
WHITESPACES_AS_REGEXP + b"BI" + WHITESPACES_AS_REGEXP,
self._get_contents_as_bytes() or b"",
)
)
self.inline_images_keys = [f"~{x}~" for x in range(nb_inlines)]
if obj is None:
obj = self
if ancest is None:
Expand All @@ -485,15 +495,15 @@ def _get_ids_image(
if PG.RESOURCES not in obj or RES.XOBJECT not in cast(
DictionaryObject, obj[PG.RESOURCES]
):
return list(self.inline_images.keys())
return self.inline_images_keys

x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore
for o in x_object:
if x_object[o][IA.SUBTYPE] == "/Image":
lst.append(o if len(ancest) == 0 else ancest + [o])
else: # is a form with possible images inside
lst.extend(self._get_ids_image(x_object[o], ancest + [o]))
return lst + list(self.inline_images.keys())
return lst + self.inline_images_keys

def _get_image(
self,
Expand All @@ -515,6 +525,8 @@ def _get_image(
raise
if isinstance(id, str):
if id[0] == "~" and id[-1] == "~":
if self.inline_images is None:
self.inline_images = self._get_inline_images()
if self.inline_images is None: # pragma: no cover
raise KeyError("no inline image can be found")
return self.inline_images[id]
Expand Down Expand Up @@ -894,6 +906,23 @@ def _add_transformation_matrix(
)
return contents

def _get_contents_as_bytes(self) -> Optional[bytes]:
    """
    Serialize the page's ``/Contents`` entry to raw bytes.

    Returns:
        ``None`` when the page has no ``/Contents`` entry. Otherwise the
        bytes returned by ``get_data()`` on the resolved ``/Contents``
        object; when that object is a list, the ``get_data()`` results of
        its resolved items are concatenated in order with no separator.
    """
    # Guard clause: nothing to serialize without a /Contents entry.
    if PG.CONTENTS not in self:
        return None

    contents = self[PG.CONTENTS].get_object()

    # Single object: hand back its data directly.
    if not isinstance(contents, list):
        return cast(bytes, cast(EncodedStreamObject, contents).get_data())

    # List of objects: resolve each item and join their data in order.
    chunks = [part.get_object().get_data() for part in contents]
    return b"".join(chunks)

def get_contents(self) -> Optional[ContentStream]:
"""
Access the page contents.
Expand Down
1 change: 1 addition & 0 deletions pypdf/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,7 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]:


WHITESPACES = (b" ", b"\n", b"\r", b"\t", b"\x00")
WHITESPACES_AS_REGEXP = b"[ \n\r\t\x00]"


def paeth_predictor(left: int, up: int, up_left: int) -> int:
Expand Down
3 changes: 0 additions & 3 deletions tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -1016,9 +1016,6 @@ def test_inline_images():
_a[x] = y
with pytest.raises(KeyError) as exc:
reader.pages[2]._get_image(("test",))
reader.pages[2].inline_images = None
with pytest.raises(KeyError) as exc:
reader.pages[2]._get_image(("~1~",))


@pytest.mark.enable_socket()
Expand Down

0 comments on commit 94f23f9

Please sign in to comment.