ENH: Process /uniHHHH for text_extract (#2043)

`/uniHHHH` glyphs (where H is a hexadecimal digit) seem to be generated by LaTeX, but this is OK for other characters. This was mentioned in #2016 / #2038.
py-pdf · Jul 30, 2023 · 534c7b4 · 534c7b4
1 parent f617f69
commit 534c7b4
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 8 deletions.
diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
@@ -461,10 +461,19 @@ def type1_alternative(
  continue
  try:
  i = int(words[1])
- v = adobe_glyphs[words[2].decode()]
- except (ValueError, KeyError):
+ except ValueError: # pragma: no cover
  continue
- if v == " ":
+ try:
+ v = adobe_glyphs[words[2].decode()]
+ except KeyError:
+ if words[2].startswith(b"/uni"):
+ try:
+ v = chr(int(words[2][4:], 16))
+ except ValueError: # pragma: no cover
+ continue
+ else:
+ continue
+ if words[2].decode() == b" ":
  space_code = i
  map_dict[chr(i)] = v
  int_entry.append(i)

diff --git a/tests/test_cmap.py b/tests/test_cmap.py
@@ -179,3 +179,13 @@ def test_latex():
  for pat in ("α", "β", "γ", "ϕ", "φ", "ℏ", "∫", "∂", "·", "×"):
  assert pat in txt
  # actually the ϕ and φ seems to be crossed in latex
+
+
+@pytest.mark.enable_socket()
+def test_unixxx_glyphs():
+ url = "https://arxiv.org/pdf/2201.00021.pdf"
+ name = "unixxx_glyphs.pdf"
+ reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+ txt = reader.pages[0].extract_text() # no error
+ for pat in ("闫耀庭", "龚龑", "张江水", "1′′.2"):
+ assert pat in txt
diff --git a/tests/test_encryption.py b/tests/test_encryption.py
@@ -125,7 +125,10 @@ def test_encryption(name, requires_pycryptodome):
  ("r6-both-passwords.pdf", "foo", "bar"),
  ],
 )
-@pytest.mark.skipif(not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, reason="No pycryptodome / cryptography")
+@pytest.mark.skipif(
+ not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY,
+ reason="No pycryptodome / cryptography",
+)
 def test_pdf_with_both_passwords(name, user_passwd, owner_passwd):
  """
  PDFs with both user and owner passwords are handled correctly.
@@ -151,7 +154,10 @@ def test_pdf_with_both_passwords(name, user_passwd, owner_passwd):
  ("crazyones-encrypted-256.pdf", b"password"),
  ],
 )
-@pytest.mark.skipif(not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, reason="No pycryptodome / cryptography")
+@pytest.mark.skipif(
+ not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY,
+ reason="No pycryptodome / cryptography",
+)
 def test_read_page_from_encrypted_file_aes_256(pdffile, password):
  """
  A page can be read from an encrypted.
@@ -176,7 +182,10 @@ def test_read_page_from_encrypted_file_aes_256(pdffile, password):
  ),
  ],
 )
-@pytest.mark.skipif(not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, reason="No pycryptodome / cryptography")
+@pytest.mark.skipif(
+ not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY,
+ reason="No pycryptodome / cryptography",
+)
 @pytest.mark.filterwarnings("ignore::DeprecationWarning")
 def test_merge_encrypted_pdfs(names):
  """Encrypted PDFs can be merged after decryption."""
@@ -193,7 +202,7 @@ def test_merge_encrypted_pdfs(names):
 
 @pytest.mark.skipif(
  HAS_CRYPTOGRAPHY,
- reason="Limitations of cryptography. see https://github.com/pyca/cryptography/issues/2494"
+ reason="Limitations of cryptography. see https://github.com/pyca/cryptography/issues/2494",
 )
 @pytest.mark.parametrize(
  "cryptcls",
@@ -346,7 +355,10 @@ def test_pdf_encrypt_multiple(pdf_file_path, count):
  assert text0 == text1
 
 
-@pytest.mark.skipif(not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, reason="No pycryptodome / cryptography")
+@pytest.mark.skipif(
+ not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY,
+ reason="No pycryptodome / cryptography",
+)
 def test_aes_decrypt_corrupted_data():
  """Just for robustness"""
  aes = CryptAES(secrets.token_bytes(16))