From 534c7b4bf7052c103d94e5761631979e23411e39 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 30 Jul 2023 17:28:55 +0200 Subject: [PATCH] ENH: Process /uniHHHH for text_extract (#2043) `/uniHHHH` glyphs (where H is a hexadecimal digit) seem to be generated by LaTeX, but the handling is OK for other characters as well. This was mentioned in #2016 / #2038 --- pypdf/_cmap.py | 15 ++++++++++++--- tests/test_cmap.py | 10 ++++++++++ tests/test_encryption.py | 22 +++++++++++++++++----- 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 14c1e229c..6400f89b1 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -461,10 +461,19 @@ def type1_alternative( continue try: i = int(words[1]) - v = adobe_glyphs[words[2].decode()] - except (ValueError, KeyError): + except ValueError: # pragma: no cover continue - if v == " ": + try: + v = adobe_glyphs[words[2].decode()] + except KeyError: + if words[2].startswith(b"/uni"): + try: + v = chr(int(words[2][4:], 16)) + except ValueError: # pragma: no cover + continue + else: + continue + if words[2].decode() == b" ": space_code = i map_dict[chr(i)] = v int_entry.append(i) diff --git a/tests/test_cmap.py b/tests/test_cmap.py index f74da326d..6e7448651 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -179,3 +179,13 @@ def test_latex(): for pat in ("α", "β", "γ", "ϕ", "φ", "ℏ", "∫", "∂", "·", "×"): assert pat in txt # actually the ϕ and φ seems to be crossed in latex + + +@pytest.mark.enable_socket() +def test_unixxx_glyphs(): + url = "https://arxiv.org/pdf/2201.00021.pdf" + name = "unixxx_glyphs.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + txt = reader.pages[0].extract_text() # no error + for pat in ("闫耀庭", "龚龑", "张江水", "1′′.2"): + assert pat in txt diff --git a/tests/test_encryption.py b/tests/test_encryption.py index ff33d2121..6641977a4 100644 --- a/tests/test_encryption.py +++ b/tests/test_encryption.py @@ -125,7 +125,10 @@ def 
test_encryption(name, requires_pycryptodome): ("r6-both-passwords.pdf", "foo", "bar"), ], ) -@pytest.mark.skipif(not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, reason="No pycryptodome / cryptography") +@pytest.mark.skipif( + not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, + reason="No pycryptodome / cryptography", +) def test_pdf_with_both_passwords(name, user_passwd, owner_passwd): """ PDFs with both user and owner passwords are handled correctly. @@ -151,7 +154,10 @@ def test_pdf_with_both_passwords(name, user_passwd, owner_passwd): ("crazyones-encrypted-256.pdf", b"password"), ], ) -@pytest.mark.skipif(not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, reason="No pycryptodome / cryptography") +@pytest.mark.skipif( + not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, + reason="No pycryptodome / cryptography", +) def test_read_page_from_encrypted_file_aes_256(pdffile, password): """ A page can be read from an encrypted. @@ -176,7 +182,10 @@ def test_read_page_from_encrypted_file_aes_256(pdffile, password): ), ], ) -@pytest.mark.skipif(not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, reason="No pycryptodome / cryptography") +@pytest.mark.skipif( + not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, + reason="No pycryptodome / cryptography", +) @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_merge_encrypted_pdfs(names): """Encrypted PDFs can be merged after decryption.""" @@ -193,7 +202,7 @@ def test_merge_encrypted_pdfs(names): @pytest.mark.skipif( HAS_CRYPTOGRAPHY, - reason="Limitations of cryptography. see https://github.com/pyca/cryptography/issues/2494" + reason="Limitations of cryptography. 
see https://github.com/pyca/cryptography/issues/2494", ) @pytest.mark.parametrize( "cryptcls", @@ -346,7 +355,10 @@ def test_pdf_encrypt_multiple(pdf_file_path, count): assert text0 == text1 -@pytest.mark.skipif(not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, reason="No pycryptodome / cryptography") +@pytest.mark.skipif( + not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, + reason="No pycryptodome / cryptography", +) def test_aes_decrypt_corrupted_data(): """Just for robustness""" aes = CryptAES(secrets.token_bytes(16))