From fb4f466ed9bd148d9677658c00e69bd8fc4fb3b5 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 12 Aug 2023 12:30:52 +0200 Subject: [PATCH] BUG: Prevent stall when accessing image in corrupted pdf (#2081) Closes #2077 --- pypdf/_page.py | 14 ++++++++++++-- tests/test_images.py | 10 ++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 01c5b0506..00800a9bf 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -477,8 +477,18 @@ def _old_images(self) -> List[File]: # deprecated return images_extracted def _get_ids_image( - self, obj: Optional[DictionaryObject] = None, ancest: Optional[List[str]] = None + self, + obj: Optional[DictionaryObject] = None, + ancest: Optional[List[str]] = None, + call_stack: Optional[List[Any]] = None, ) -> List[Union[str, List[str]]]: + if call_stack is None: + call_stack = [] + _i = getattr(obj, "indirect_reference", None) + if _i in call_stack: + return [] + else: + call_stack.append(_i) if self.inline_images_keys is None: nb_inlines = len( re.findall( @@ -502,7 +512,7 @@ def _get_ids_image( if x_object[o][IA.SUBTYPE] == "/Image": lst.append(o if len(ancest) == 0 else ancest + [o]) else: # is a form with possible images inside - lst.extend(self._get_ids_image(x_object[o], ancest + [o])) + lst.extend(self._get_ids_image(x_object[o], ancest + [o], call_stack)) return lst + self.inline_images_keys def _get_image( diff --git a/tests/test_images.py b/tests/test_images.py index b159af0d3..2f14c7b38 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -209,3 +209,13 @@ def test_image_extraction(src, page_index, image_key, expected): with open(f"page-{page_index}-{actual_image.name}", "wb") as fp: fp.write(actual_image.data) assert image_similarity(BytesIO(actual_image.data), expected) >= 0.99 + + +@pytest.mark.enable_socket() +@pytest.mark.timeout(30) +def test_loop_in_image_keys(): + """Cf #2077""" + url = "https://github.com/py-pdf/pypdf/files/12309492/example_134.pdf" + name = "iss2077.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + reader.pages[0].images.keys()