diff --git a/pypdf/_page.py b/pypdf/_page.py index b351749ea..daf988d47 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -2332,7 +2332,9 @@ def _get_fonts(self) -> Tuple[Set[str], Set[str]]: """ obj = self.get_object() assert isinstance(obj, DictionaryObject) - fonts, embedded = _get_fonts_walk(cast(DictionaryObject, obj[PG.RESOURCES])) + fonts: Set[str] = set() + embedded: Set[str] = set() + fonts, embedded = _get_fonts_walk(cast(DictionaryObject, obj), fonts, embedded) unembedded = fonts - embedded return embedded, unembedded @@ -2560,8 +2562,8 @@ def __str__(self) -> str: def _get_fonts_walk( obj: DictionaryObject, - fnt: Optional[Set[str]] = None, - emb: Optional[Set[str]] = None, + fnt: Set[str], + emb: Set[str], ) -> Tuple[Set[str], Set[str]]: """ Get the set of all fonts and all embedded fonts. @@ -2581,22 +2583,77 @@ def _get_fonts_walk( We create and add to two sets, fnt = fonts used and emb = fonts embedded. """ - if fnt is None: - fnt = set() - if emb is None: - emb = set() - if not hasattr(obj, "keys"): - return set(), set() fontkeys = ("/FontFile", "/FontFile2", "/FontFile3") - if "/BaseFont" in obj: - fnt.add(cast(str, obj["/BaseFont"])) - if "/FontName" in obj and [x for x in fontkeys if x in obj]: - # the list comprehension ensures there is FontFile - emb.add(cast(str, obj["/FontName"])) - - for key in obj: - _get_fonts_walk(cast(DictionaryObject, obj[key]), fnt, emb) + def process_font(f: DictionaryObject) -> None: + nonlocal fnt, emb + f = cast(DictionaryObject, f.get_object()) # to be sure + if "/BaseFont" in f: + fnt.add(cast(str, f["/BaseFont"])) + + if ( + ("/CharProcs" in f) + or ( + "/FontDescriptor" in f + and any( + x in cast(DictionaryObject, f["/FontDescriptor"]) for x in fontkeys + ) + ) + or ( + "/DescendantFonts" in f + and "/FontDescriptor" + in cast( + DictionaryObject, + cast(ArrayObject, f["/DescendantFonts"])[0].get_object(), + ) + and any( + x + in cast( + DictionaryObject, + cast( + DictionaryObject, + cast(ArrayObject, f["/DescendantFonts"])[0].get_object(), + )["/FontDescriptor"], + ) + for x in fontkeys + ) + ) + ): + # the list comprehension ensures there is FontFile + emb.add(cast(str, f["/BaseFont"])) + + if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]): + for f in cast(DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"]): + process_font(f) + if "/Resources" in obj: + if "/Font" in cast(DictionaryObject, obj["/Resources"]): + for f in cast( + DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/Font"] + ).values(): + process_font(f) + if "/XObject" in cast(DictionaryObject, obj["/Resources"]): + for x in cast( + DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/XObject"] + ).values(): + _get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb) + if "/Annots" in obj: + for a in cast(ArrayObject, obj["/Annots"]): + _get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb) + if "/AP" in obj: + if ( + cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]).get( + "/Type" + ) + == "/XObject" + ): + _get_fonts_walk( + cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]), + fnt, + emb, + ) + else: + for a in cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]): + _get_fonts_walk(cast(DictionaryObject, a), fnt, emb) return fnt, emb # return the sets for each page diff --git a/tests/test_page.py b/tests/test_page.py index fb916ea49..1d6c49443 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -750,6 +750,13 @@ def visitor_td(op, args, cm, tm) -> None: set(), {"/Helvetica"}, ), + # fonts in annotations + ( + RESOURCE_ROOT / "FormTestFromOo.pdf", + None, + {"/CAAAAA+LiberationSans", "/EAAAAA+SegoeUI", "/BAAAAA+LiberationSerif"}, + {"/LiberationSans", "/ZapfDingbats"}, + ), ], ) def test_get_fonts(pdf_path, password, embedded, unembedded): @@ -763,6 +770,44 @@ def test_get_fonts(pdf_path, password, embedded, unembedded): assert (a, b) == (embedded, unembedded) +@pytest.mark.enable_socket() +def test_get_fonts2(): + url = "https://github.com/py-pdf/pypdf/files/12618104/WS_T.483.8-2016.pdf" + name = "WS_T.483.8-2016.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + assert reader.pages[1]._get_fonts() == ( + { + "/E-HZ9-PK7483a5-Identity-H", + "/SSJ-PK748200005d9-Identity-H", + "/QGNGZS+FzBookMaker1DlFont10536872415", + "/E-BZ9-PK748344-Identity-H", + "/E-FZ9-PK74836f-Identity-H", + "/O9-PK748464-Identity-H", + "/QGNGZR+FzBookMaker0DlFont00536872414", + "/SSJ-PK748200005db-Identity-H", + "/F-BZ9-PK7483cb-Identity-H", + "/SSJ-PK748200005da-Identity-H", + "/H-SS9-PK748200005e0-Identity-H", + "/H-HT9-PK748200005e1-Identity-H", + }, + set(), + ) + assert reader.pages[2]._get_fonts() == ( + { + "/E-HZ9-PK7483a5-Identity-H", + "/E-FZ9-PK74836f-Identity-H", + "/E-BZ9-PK748344-Identity-H", + "/QGNGZT+FzBookMaker0DlFont00536872418", + "/O9-PK748464-Identity-H", + "/F-BZ9-PK7483cb-Identity-H", + "/H-SS9-PK748200005e0-Identity-H", + "/QGNGZU+FzBookMaker1DlFont10536872420", + "/H-HT9-PK748200005e1-Identity-H", + }, + set(), + ) + + def test_annotation_getter(): pdf_path = RESOURCE_ROOT / "commented.pdf" reader = PdfReader(pdf_path)