BUG: TypeError: can't concat str to bytes (#2114)

This was introduced when I removed seemingly unnecessary calls to `b_`, a helper function that converts `Union[bytes, str]` to `bytes`.

Root-cause: Too little test coverage + wrong type annotations
Caused-by: 3033122
Closes #2111
py-pdf · Aug 24, 2023 · f16f434 · f16f434
1 parent cbeed04
commit f16f434
Show file tree

Hide file tree

Showing 5 changed files with 21 additions and 14 deletions.
diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py
@@ -43,7 +43,7 @@
 from pypdf._crypt_providers import rc4_decrypt as RC4_decrypt # noqa: N812
 from pypdf._crypt_providers import rc4_encrypt as RC4_encrypt # noqa: N812
 
-from ._utils import logger_warning
+from ._utils import b_, logger_warning
 from .generic import (
  ArrayObject,
  ByteStringObject,
@@ -75,7 +75,7 @@ def encrypt_object(self, obj: PdfObject) -> PdfObject:
  elif isinstance(obj, StreamObject):
  obj2 = StreamObject()
  obj2.update(obj)
- obj2._data = self.stmCrypt.encrypt(obj._data)
+ obj2._data = self.stmCrypt.encrypt(b_(obj._data))
  obj = obj2
  elif isinstance(obj, DictionaryObject):
  obj2 = DictionaryObject() # type: ignore
@@ -91,7 +91,7 @@ def decrypt_object(self, obj: PdfObject) -> PdfObject:
  data = self.strCrypt.decrypt(obj.original_bytes)
  obj = create_string_object(data)
  elif isinstance(obj, StreamObject):
- obj._data = self.stmCrypt.decrypt(obj._data)
+ obj._data = self.stmCrypt.decrypt(b_(obj._data))
  elif isinstance(obj, DictionaryObject):
  for key, value in obj.items():
  obj[key] = self.decrypt_object(value)

diff --git a/pypdf/_reader.py b/pypdf/_reader.py
@@ -54,6 +54,7 @@
 from ._utils import (
  StrByteType,
  StreamType,
+ b_,
  deprecate_no_replacement,
  deprecation_no_replacement,
  deprecation_with_replacement,
@@ -1256,7 +1257,7 @@ def _get_object_from_stream(
  assert cast(str, obj_stm["/Type"]) == "/ObjStm"
  # /N is the number of indirect objects in the stream
  assert idx < obj_stm["/N"]
- stream_data = BytesIO(obj_stm.get_data())
+ stream_data = BytesIO(b_(obj_stm.get_data()))
  for i in range(obj_stm["/N"]): # type: ignore
  read_non_whitespace(stream_data)
  stream_data.seek(-1, 1)
@@ -1867,7 +1868,7 @@ def _read_pdf15_xref_stream(
  xrefstream = cast(ContentStream, read_object(stream, self))
  assert cast(str, xrefstream["/Type"]) == "/XRef"
  self.cache_indirect_object(generation, idnum, xrefstream)
- stream_data = BytesIO(xrefstream.get_data())
+ stream_data = BytesIO(b_(xrefstream.get_data()))
  # Index pairs specify the subsections in the dictionary. If
  # none create one subsection that spans everything.
  idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
@@ -2118,7 +2119,7 @@ def xfa(self) -> Optional[Dict[str, Any]]:
  if isinstance(f, IndirectObject):
  field = cast(Optional[EncodedStreamObject], f.get_object())
  if field:
- es = zlib.decompress(field._data)
+ es = zlib.decompress(b_(field._data))
  retval[tag] = es
  return retval
 

diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -41,6 +41,7 @@
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 from ._utils import (
+ b_,
  deprecate_with_replacement,
  logger_warning,
  ord_,
@@ -655,7 +656,7 @@ def decode(
  return tiff_header + data
 
 
-def decode_stream_data(stream: Any) -> bytes: # utils.StreamObject
+def decode_stream_data(stream: Any) -> Union[bytes, str]: # utils.StreamObject
  """
  Decode the stream data based on the specified filters.
 
@@ -682,7 +683,7 @@ def decode_stream_data(stream: Any) -> bytes: # utils.StreamObject
  decodparms = stream.get(SA.DECODE_PARMS, ({},) * len(filters))
  if not isinstance(decodparms, (list, tuple)):
  decodparms = (decodparms,)
- data: bytes = stream._data
+ data: bytes = b_(stream._data)
  # If there is not data to decode we should not try to decode the data.
  if data:
  for filter_type, params in zip(filters, decodparms):

diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
@@ -785,7 +785,7 @@ def _reset_node_tree_relationship(child_obj: Any) -> None:
 
 class StreamObject(DictionaryObject):
  def __init__(self) -> None:
- self._data: bytes = b""
+ self._data: Union[bytes, str] = b""
  self.decoded_self: Optional[DecodedStreamObject] = None
 
  def _clone(
@@ -820,7 +820,7 @@ def _clone(
 
  def hash_value_data(self) -> bytes:
  data = super().hash_value_data()
- data += self._data
+ data += b_(self._data)
  return data
 
  @property
@@ -901,13 +901,13 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject":
  retval[NameObject(SA.FILTER)] = f
  if parms is not None:
  retval[NameObject(SA.DECODE_PARMS)] = parms
- retval._data = FlateDecode.encode(self._data, level)
+ retval._data = FlateDecode.encode(b_(self._data), level)
  return retval
 
 
 class DecodedStreamObject(StreamObject):
  def get_data(self) -> bytes:
- return self._data
+ return b_(self._data)
 
  def set_data(self, data: bytes) -> None:
  self._data = data
@@ -935,7 +935,7 @@ def decodedSelf(self, value: DecodedStreamObject) -> None: # deprecated
  deprecation_with_replacement("decodedSelf", "decoded_self", "3.0.0")
  self.decoded_self = value
 
- def get_data(self) -> bytes:
+ def get_data(self) -> Union[bytes, str]:
  from ..filters import decode_stream_data
 
  if self.decoded_self is not None:
@@ -996,7 +996,7 @@ def __init__(
  if isinstance(stream, ArrayObject):
  data = b""
  for s in stream:
- data += s.get_object().get_data()
+ data += b_(s.get_object().get_data())
  if len(data) == 0 or data[-1] != b"\n":
  data += b"\n"
  stream_bytes = BytesIO(data)

diff --git a/tests/test_page.py b/tests/test_page.py
@@ -395,6 +395,11 @@ def test_iss_1142():
  "https://github.com/py-pdf/pypdf/files/9428434/TelemetryTX_EM.pdf",
  "tika-964029.pdf",
  ), # no_ressources
+ (
+ # https://www.itu.int/rec/T-REC-X.25-199610-I/en
+ "https://github.com/py-pdf/pypdf/files/12423313/T-REC-X.25-199610-I.PDF-E.pdf",
+ "T-REC-X.25-199610-I!!PDF-E.pdf",
+ ),
  ],
 )
 def test_extract_text(url, name):