Skip to content

Commit

Permalink
ENH: Process /uniHHHH for text_extract (#2043)
Browse files Browse the repository at this point in the history
`/uniHHHH` glyph names (where each H is a hexadecimal digit) seem to be generated by LaTeX, but the same handling is valid for other characters as well

This was mentioned in #2016 / #2038
  • Loading branch information
pubpub-zz authored Jul 30, 2023
1 parent f617f69 commit 534c7b4
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 8 deletions.
15 changes: 12 additions & 3 deletions pypdf/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,10 +461,19 @@ def type1_alternative(
continue
try:
i = int(words[1])
v = adobe_glyphs[words[2].decode()]
except (ValueError, KeyError):
except ValueError: # pragma: no cover
continue
if v == " ":
try:
v = adobe_glyphs[words[2].decode()]
except KeyError:
if words[2].startswith(b"/uni"):
try:
v = chr(int(words[2][4:], 16))
except ValueError: # pragma: no cover
continue
else:
continue
if words[2].decode() == b" ":
space_code = i
map_dict[chr(i)] = v
int_entry.append(i)
Expand Down
10 changes: 10 additions & 0 deletions tests/test_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,3 +179,13 @@ def test_latex():
for pat in ("α", "β", "γ", "ϕ", "φ", "ℏ", "∫", "∂", "·", "×"):
assert pat in txt
# actually, ϕ and φ seem to be swapped in LaTeX


@pytest.mark.enable_socket()
def test_unixxx_glyphs():
    """Check that the expected strings appear in the first page's extracted text."""
    pdf_bytes = get_pdf_from_url(
        "https://arxiv.org/pdf/2201.00021.pdf", name="unixxx_glyphs.pdf"
    )
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]
    extracted = first_page.extract_text()  # extraction itself must not raise
    # Every fragment below has to be present in the extracted text.
    expected_fragments = ("闫耀庭", "龚龑", "张江水", "1′′.2")
    for fragment in expected_fragments:
        assert fragment in extracted
22 changes: 17 additions & 5 deletions tests/test_encryption.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,10 @@ def test_encryption(name, requires_pycryptodome):
("r6-both-passwords.pdf", "foo", "bar"),
],
)
@pytest.mark.skipif(not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, reason="No pycryptodome / cryptography")
@pytest.mark.skipif(
not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY,
reason="No pycryptodome / cryptography",
)
def test_pdf_with_both_passwords(name, user_passwd, owner_passwd):
"""
PDFs with both user and owner passwords are handled correctly.
Expand All @@ -151,7 +154,10 @@ def test_pdf_with_both_passwords(name, user_passwd, owner_passwd):
("crazyones-encrypted-256.pdf", b"password"),
],
)
@pytest.mark.skipif(not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, reason="No pycryptodome / cryptography")
@pytest.mark.skipif(
not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY,
reason="No pycryptodome / cryptography",
)
def test_read_page_from_encrypted_file_aes_256(pdffile, password):
"""
A page can be read from an encrypted file.
Expand All @@ -176,7 +182,10 @@ def test_read_page_from_encrypted_file_aes_256(pdffile, password):
),
],
)
@pytest.mark.skipif(not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, reason="No pycryptodome / cryptography")
@pytest.mark.skipif(
not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY,
reason="No pycryptodome / cryptography",
)
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_merge_encrypted_pdfs(names):
"""Encrypted PDFs can be merged after decryption."""
Expand All @@ -193,7 +202,7 @@ def test_merge_encrypted_pdfs(names):

@pytest.mark.skipif(
HAS_CRYPTOGRAPHY,
reason="Limitations of cryptography. see https://github.com/pyca/cryptography/issues/2494"
reason="Limitations of cryptography. see https://github.com/pyca/cryptography/issues/2494",
)
@pytest.mark.parametrize(
"cryptcls",
Expand Down Expand Up @@ -346,7 +355,10 @@ def test_pdf_encrypt_multiple(pdf_file_path, count):
assert text0 == text1


@pytest.mark.skipif(not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, reason="No pycryptodome / cryptography")
@pytest.mark.skipif(
not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY,
reason="No pycryptodome / cryptography",
)
def test_aes_decrypt_corrupted_data():
"""Just for robustness"""
aes = CryptAES(secrets.token_bytes(16))
Expand Down

0 comments on commit 534c7b4

Please sign in to comment.