Skip to content

Commit

Permalink
BUG: _get_fonts not processing properly CIDFonts and annotations (#2194)
Browse files Browse the repository at this point in the history
Closes #2192
  • Loading branch information
pubpub-zz authored Sep 17, 2023
1 parent a85c1c6 commit ea64f5b
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 17 deletions.
91 changes: 74 additions & 17 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -2332,7 +2332,9 @@ def _get_fonts(self) -> Tuple[Set[str], Set[str]]:
"""
obj = self.get_object()
assert isinstance(obj, DictionaryObject)
fonts, embedded = _get_fonts_walk(cast(DictionaryObject, obj[PG.RESOURCES]))
fonts: Set[str] = set()
embedded: Set[str] = set()
fonts, embedded = _get_fonts_walk(cast(DictionaryObject, obj), fonts, embedded)
unembedded = fonts - embedded
return embedded, unembedded

Expand Down Expand Up @@ -2560,8 +2562,8 @@ def __str__(self) -> str:

def _get_fonts_walk(
obj: DictionaryObject,
fnt: Optional[Set[str]] = None,
emb: Optional[Set[str]] = None,
fnt: Set[str],
emb: Set[str],
) -> Tuple[Set[str], Set[str]]:
"""
Get the set of all fonts and all embedded fonts.
Expand All @@ -2581,22 +2583,77 @@ def _get_fonts_walk(
We create and add to two sets, fnt = fonts used and emb = fonts embedded.
"""
if fnt is None:
fnt = set()
if emb is None:
emb = set()
if not hasattr(obj, "keys"):
return set(), set()
fontkeys = ("/FontFile", "/FontFile2", "/FontFile3")
if "/BaseFont" in obj:
fnt.add(cast(str, obj["/BaseFont"]))
if "/FontName" in obj and [x for x in fontkeys if x in obj]:
# the list comprehension ensures there is FontFile
emb.add(cast(str, obj["/FontName"]))

for key in obj:
_get_fonts_walk(cast(DictionaryObject, obj[key]), fnt, emb)

def process_font(f: DictionaryObject) -> None:
    """
    Record one font dictionary in the enclosing ``fnt`` / ``emb`` sets.

    The font's ``/BaseFont`` name is always added to ``fnt``. It is also
    added to ``emb`` when the font is considered embedded, i.e. when any
    of the following holds:

    * the font dictionary has a ``/CharProcs`` entry;
    * its ``/FontDescriptor`` contains one of ``fontkeys``;
    * the ``/FontDescriptor`` of its first ``/DescendantFonts`` entry
      contains one of ``fontkeys``.

    A font dictionary without ``/BaseFont`` is skipped: there is no name
    to report, and looking the key up unconditionally raised ``KeyError``
    for such fonts (presumably Type 3 fonts, which are identified by
    ``/CharProcs`` -- confirm against the PDF specification).

    Args:
        f: The font dictionary, or a reference that resolves to it.
    """
    # ``fnt`` and ``emb`` are only mutated in place, never rebound, so the
    # closure can reach them without a ``nonlocal`` declaration.
    f = cast(DictionaryObject, f.get_object())  # to be sure

    def has_font_file(font: DictionaryObject) -> bool:
        # True when the font's descriptor carries an embedded font program.
        if "/FontDescriptor" not in font:
            return False
        descriptor = cast(DictionaryObject, font["/FontDescriptor"])
        return any(x in descriptor for x in fontkeys)

    if "/BaseFont" not in f:
        return
    base_font = cast(str, f["/BaseFont"])
    fnt.add(base_font)

    embedded = "/CharProcs" in f or has_font_file(f)
    if not embedded and "/DescendantFonts" in f:
        descendants = cast(ArrayObject, f["/DescendantFonts"])
        # Only the first descendant is inspected; an empty array simply
        # means there is nothing to inspect instead of an IndexError.
        if len(descendants) > 0:
            embedded = has_font_file(
                cast(DictionaryObject, descendants[0].get_object())
            )

    if embedded:
        emb.add(base_font)

if "/DR" in obj and "/Font" in cast(DictionaryObject, obj["/DR"]):
for f in cast(DictionaryObject, cast(DictionaryObject, obj["/DR"])["/Font"]):
process_font(f)
if "/Resources" in obj:
if "/Font" in cast(DictionaryObject, obj["/Resources"]):
for f in cast(
DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/Font"]
).values():
process_font(f)
if "/XObject" in cast(DictionaryObject, obj["/Resources"]):
for x in cast(
DictionaryObject, cast(DictionaryObject, obj["/Resources"])["/XObject"]
).values():
_get_fonts_walk(cast(DictionaryObject, x.get_object()), fnt, emb)
if "/Annots" in obj:
for a in cast(ArrayObject, obj["/Annots"]):
_get_fonts_walk(cast(DictionaryObject, a.get_object()), fnt, emb)
if "/AP" in obj:
if (
cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]).get(
"/Type"
)
== "/XObject"
):
_get_fonts_walk(
cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]),
fnt,
emb,
)
else:
for a in cast(DictionaryObject, cast(DictionaryObject, obj["/AP"])["/N"]):
_get_fonts_walk(cast(DictionaryObject, a), fnt, emb)
return fnt, emb # return the sets for each page


Expand Down
45 changes: 45 additions & 0 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -750,6 +750,13 @@ def visitor_td(op, args, cm, tm) -> None:
set(),
{"/Helvetica"},
),
# fonts in annotations
(
RESOURCE_ROOT / "FormTestFromOo.pdf",
None,
{"/CAAAAA+LiberationSans", "/EAAAAA+SegoeUI", "/BAAAAA+LiberationSerif"},
{"/LiberationSans", "/ZapfDingbats"},
),
],
)
def test_get_fonts(pdf_path, password, embedded, unembedded):
Expand All @@ -763,6 +770,44 @@ def test_get_fonts(pdf_path, password, embedded, unembedded):
assert (a, b) == (embedded, unembedded)


@pytest.mark.enable_socket()
def test_get_fonts2():
    """
    Check ``_get_fonts()`` on two pages of a downloaded sample PDF.

    For the pages at index 1 and 2, every listed font must be reported as
    embedded and the set of unembedded fonts must be empty.
    """
    url = "https://github.com/py-pdf/pypdf/files/12618104/WS_T.483.8-2016.pdf"
    name = "WS_T.483.8-2016.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))

    # page index -> font names expected in the "embedded" set
    expected_embedded = {
        1: {
            "/E-BZ9-PK748344-Identity-H",
            "/E-FZ9-PK74836f-Identity-H",
            "/E-HZ9-PK7483a5-Identity-H",
            "/F-BZ9-PK7483cb-Identity-H",
            "/H-HT9-PK748200005e1-Identity-H",
            "/H-SS9-PK748200005e0-Identity-H",
            "/O9-PK748464-Identity-H",
            "/QGNGZR+FzBookMaker0DlFont00536872414",
            "/QGNGZS+FzBookMaker1DlFont10536872415",
            "/SSJ-PK748200005d9-Identity-H",
            "/SSJ-PK748200005da-Identity-H",
            "/SSJ-PK748200005db-Identity-H",
        },
        2: {
            "/E-BZ9-PK748344-Identity-H",
            "/E-FZ9-PK74836f-Identity-H",
            "/E-HZ9-PK7483a5-Identity-H",
            "/F-BZ9-PK7483cb-Identity-H",
            "/H-HT9-PK748200005e1-Identity-H",
            "/H-SS9-PK748200005e0-Identity-H",
            "/O9-PK748464-Identity-H",
            "/QGNGZT+FzBookMaker0DlFont00536872418",
            "/QGNGZU+FzBookMaker1DlFont10536872420",
        },
    }
    for page_index, embedded_fonts in expected_embedded.items():
        assert reader.pages[page_index]._get_fonts() == (embedded_fonts, set())


def test_annotation_getter():
pdf_path = RESOURCE_ROOT / "commented.pdf"
reader = PdfReader(pdf_path)
Expand Down

0 comments on commit ea64f5b

Please sign in to comment.