diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py index ef7affc74..22e7cf864 100644 --- a/pypdf/_encryption.py +++ b/pypdf/_encryption.py @@ -43,7 +43,7 @@ from pypdf._crypt_providers import rc4_decrypt as RC4_decrypt # noqa: N812 from pypdf._crypt_providers import rc4_encrypt as RC4_encrypt # noqa: N812 -from ._utils import logger_warning +from ._utils import b_, logger_warning from .generic import ( ArrayObject, ByteStringObject, @@ -75,7 +75,7 @@ def encrypt_object(self, obj: PdfObject) -> PdfObject: elif isinstance(obj, StreamObject): obj2 = StreamObject() obj2.update(obj) - obj2._data = self.stmCrypt.encrypt(obj._data) + obj2._data = self.stmCrypt.encrypt(b_(obj._data)) obj = obj2 elif isinstance(obj, DictionaryObject): obj2 = DictionaryObject() # type: ignore @@ -91,7 +91,7 @@ def decrypt_object(self, obj: PdfObject) -> PdfObject: data = self.strCrypt.decrypt(obj.original_bytes) obj = create_string_object(data) elif isinstance(obj, StreamObject): - obj._data = self.stmCrypt.decrypt(obj._data) + obj._data = self.stmCrypt.decrypt(b_(obj._data)) elif isinstance(obj, DictionaryObject): for key, value in obj.items(): obj[key] = self.decrypt_object(value) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 062f140a3..a4a422de8 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -54,6 +54,7 @@ from ._utils import ( StrByteType, StreamType, + b_, deprecate_no_replacement, deprecation_no_replacement, deprecation_with_replacement, @@ -1256,7 +1257,7 @@ def _get_object_from_stream( assert cast(str, obj_stm["/Type"]) == "/ObjStm" # /N is the number of indirect objects in the stream assert idx < obj_stm["/N"] - stream_data = BytesIO(obj_stm.get_data()) + stream_data = BytesIO(b_(obj_stm.get_data())) for i in range(obj_stm["/N"]): # type: ignore read_non_whitespace(stream_data) stream_data.seek(-1, 1) @@ -1867,7 +1868,7 @@ def _read_pdf15_xref_stream( xrefstream = cast(ContentStream, read_object(stream, self)) assert cast(str, xrefstream["/Type"]) == "/XRef" self.cache_indirect_object(generation, idnum, xrefstream) - stream_data = BytesIO(xrefstream.get_data()) + stream_data = BytesIO(b_(xrefstream.get_data())) # Index pairs specify the subsections in the dictionary. If # none create one subsection that spans everything. idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) @@ -2118,7 +2119,7 @@ def xfa(self) -> Optional[Dict[str, Any]]: if isinstance(f, IndirectObject): field = cast(Optional[EncodedStreamObject], f.get_object()) if field: - es = zlib.decompress(field._data) + es = zlib.decompress(b_(field._data)) retval[tag] = es return retval diff --git a/pypdf/filters.py b/pypdf/filters.py index ccd8c2bdf..f0ce4f2a2 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -41,6 +41,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union, cast from ._utils import ( + b_, deprecate_with_replacement, logger_warning, ord_, @@ -655,7 +656,7 @@ def decode( return tiff_header + data -def decode_stream_data(stream: Any) -> bytes: # utils.StreamObject +def decode_stream_data(stream: Any) -> Union[bytes, str]: # utils.StreamObject """ Decode the stream data based on the specified filters. @@ -682,7 +683,7 @@ def decode_stream_data(stream: Any) -> bytes: # utils.StreamObject decodparms = stream.get(SA.DECODE_PARMS, ({},) * len(filters)) if not isinstance(decodparms, (list, tuple)): decodparms = (decodparms,) - data: bytes = stream._data + data: bytes = b_(stream._data) # If there is not data to decode we should not try to decode the data. if data: for filter_type, params in zip(filters, decodparms): diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 2daa98354..3e6f27c9e 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -785,7 +785,7 @@ def _reset_node_tree_relationship(child_obj: Any) -> None: class StreamObject(DictionaryObject): def __init__(self) -> None: - self._data: bytes = b"" + self._data: Union[bytes, str] = b"" self.decoded_self: Optional[DecodedStreamObject] = None def _clone( @@ -820,7 +820,7 @@ def _clone( def hash_value_data(self) -> bytes: data = super().hash_value_data() - data += self._data + data += b_(self._data) return data @property @@ -901,13 +901,13 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject": retval[NameObject(SA.FILTER)] = f if parms is not None: retval[NameObject(SA.DECODE_PARMS)] = parms - retval._data = FlateDecode.encode(self._data, level) + retval._data = FlateDecode.encode(b_(self._data), level) return retval class DecodedStreamObject(StreamObject): def get_data(self) -> bytes: - return self._data + return b_(self._data) def set_data(self, data: bytes) -> None: self._data = data @@ -935,7 +935,7 @@ def decodedSelf(self, value: DecodedStreamObject) -> None: # deprecated deprecation_with_replacement("decodedSelf", "decoded_self", "3.0.0") self.decoded_self = value - def get_data(self) -> bytes: + def get_data(self) -> Union[bytes, str]: from ..filters import decode_stream_data if self.decoded_self is not None: @@ -996,7 +996,7 @@ def __init__( if isinstance(stream, ArrayObject): data = b"" for s in stream: - data += s.get_object().get_data() + data += b_(s.get_object().get_data()) if len(data) == 0 or data[-1] != b"\n": data += b"\n" stream_bytes = BytesIO(data) diff --git a/tests/test_page.py b/tests/test_page.py index d1f6f3fcb..fb1c989f3 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -395,6 +395,11 @@ def test_iss_1142(): "https://github.com/py-pdf/pypdf/files/9428434/TelemetryTX_EM.pdf", "tika-964029.pdf", ), # no_ressources + ( + # https://www.itu.int/rec/T-REC-X.25-199610-I/en + "https://github.com/py-pdf/pypdf/files/12423313/T-REC-X.25-199610-I.PDF-E.pdf", + "T-REC-X.25-199610-I!!PDF-E.pdf", + ), ], ) def test_extract_text(url, name):