py-pdf · MartinThoma · Jul 25, 2023 · Jul 21, 2023 · Jul 23, 2023
diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -245,10 +245,10 @@ class ASCIIHexDecode:
 
  @staticmethod
  def decode(
- data: str,
+ data: Union[str, bytes],
  decode_parms: Union[None, ArrayObject, DictionaryObject] = None,
  **kwargs: Any,
- ) -> str:
+ ) -> bytes:
  """
  Decode an ASCII-Hex encoded data stream.
 
@@ -268,24 +268,26 @@ def decode(
  if "decodeParms" in kwargs: # deprecated
  deprecate_with_replacement("decodeParms", "parameters", "4.0.0")
  decode_parms = kwargs["decodeParms"] # noqa: F841
- retval = ""
- hex_pair = ""
+ if isinstance(data, str):
+ data = data.encode()
+ retval = b""
+ hex_pair = b""
  index = 0
  while True:
  if index >= len(data):
  raise PdfStreamError("Unexpected EOD in ASCIIHexDecode")
- char = data[index]
- if char == ">":
+ char = data[index : index + 1]
+ if char == b">":
  break
  elif char.isspace():
  index += 1
  continue
  hex_pair += char
  if len(hex_pair) == 2:
- retval += chr(int(hex_pair, base=16))
- hex_pair = ""
+ retval += bytes((int(hex_pair, base=16),))
+ hex_pair = b""
  index += 1
- assert hex_pair == ""
+ assert hex_pair == b""
  return retval
 
 
@@ -852,6 +854,8 @@ def _handle_jpx(
 
  size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT])
  data = x_object_obj.get_data() # type: ignore
+ if isinstance(data, str): # pragma: no cover
+ data = data.encode()
  colors = x_object_obj.get("/Colors", 1)
  color_space: Any = x_object_obj.get("/ColorSpace", NullObject()).get_object()
  if (
@@ -912,7 +916,7 @@ def _handle_jpx(
  "TIFF",
  ".tiff",
  )
- elif lfilters is None:
+ else:
  img, image_format, extension = Image.frombytes(mode, size, data), "PNG", ".png"
 
  # CMYK image without decode requires reverting scale (cf p243,2§ last sentence)

diff --git a/tests/test_filters.py b/tests/test_filters.py
@@ -84,26 +84,26 @@ def test_flate_decode_decompress_with_array_params(params):
 @pytest.mark.parametrize(
  ("data", "expected"),
  [
- (">", ""),
+ (">", b""),
  (
  "6162636465666768696a6b6c6d6e6f707172737475767778797a>",
- string.ascii_lowercase,
+ string.ascii_lowercase.encode(),
  ),
  (
  "4142434445464748494a4b4c4d4e4f505152535455565758595a>",
- string.ascii_uppercase,
+ string.ascii_uppercase.encode(),
  ),
  (
  "6162636465666768696a6b6c6d6e6f707172737475767778797a4142434445464748494a4b4c4d4e4f505152535455565758595a>",
- string.ascii_letters,
+ string.ascii_letters.encode(),
  ),
- ("30313233343536373839>", string.digits),
+ ("30313233343536373839>", string.digits.encode()),
  (
  "3 031323334353637 3839>",
- string.digits,
+ string.digits.encode(),
  ), # Same as previous, but whitespaced
- ("30313233343536373839616263646566414243444546>", string.hexdigits),
- ("20090a0d0b0c>", string.whitespace),
+ ("30313233343536373839616263646566414243444546>", string.hexdigits.encode()),
+ ("20090a0d0b0c>", string.whitespace.encode()),
  ],
  ids=[
  "empty",
@@ -135,6 +135,19 @@ def test_ascii_hex_decode_missing_eod():
  assert exc.value.args[0] == "Unexpected EOD in ASCIIHexDecode"
 
 
+@pytest.mark.enable_socket()
+def test_decode_ahx():
+ """
+ Cf #1979
+ Gray image in CMYK: requires reversing
+ """
+ url = "https://github.com/py-pdf/pypdf/files/12090692/New.Jersey.Coinbase.staking.securities.charges.2023-0606_Coinbase-Penalty-and-C-D.pdf"
+ name = "NewJersey.pdf"
+ reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+ for p in reader.pages:
+ _ = list(p.images.keys())
+
+
 @pytest.mark.xfail()
 def test_ascii85decode_with_overflow():
  inputs = (