From c864f4e903084be77a408d8d3cce46b128df9f68 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 15 Jul 2023 17:26:39 +0200 Subject: [PATCH] BUG: Process 2bits and 4bits images (#1967) Closes #1954 --- pypdf/filters.py | 48 ++++++++++++++++++++++++++++++++----------- tests/test_filters.py | 19 +++++++++++++++++ 2 files changed, 55 insertions(+), 12 deletions(-) diff --git a/pypdf/filters.py b/pypdf/filters.py index efa22d907..4c05bdec8 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -641,7 +641,9 @@ def decodeStreamData(stream: Any) -> Union[str, bytes]: # deprecated return decode_stream_data(stream) -mode_str_type: TypeAlias = Literal["", "1", "RGB", "P", "L", "RGBA", "CMYK"] +mode_str_type: TypeAlias = Literal[ + "", "1", "RGB", "2bits", "4bits", "P", "L", "RGBA", "CMYK" +] def _get_imagemode( @@ -673,6 +675,8 @@ def _get_imagemode( mode_map = { "1bit": "1", # 0 will be used for 1 bit + "2bit": "2bits", # 2 bits images + "4bit": "4bits", # 4 bits "/DeviceGray": "L", "palette": "P", # reserved for color_components alignment "/DeviceRGB": "RGB", @@ -718,6 +722,24 @@ def _handle_flate( Process image encoded in flateEncode Returns img, image_format, extension """ + + def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: + mask = (1 << bits) - 1 # exactly the low `bits` bits of one sample + nbuff = bytearray(size[0] * size[1]) + by = 0 + bit = 8 - bits + for y in range(size[1]): + if bit != 8 - bits: # previous row ended mid-byte: skip its padding bits + by += 1 + bit = 8 - bits + for x in range(size[0]): + nbuff[y * size[0] + x] = (data[by] >> bit) & mask + bit -= bits + if bit < 0: + by += 1 + bit = 8 - bits + return bytes(nbuff) + extension = ".png" # mime_type = "image/png" lookup: Any base: Any @@ -726,6 +748,12 @@ def _handle_flate( color_space, base, hival, lookup = ( value.get_object() for value in color_space ) + if mode == "2bits": + mode = "P" + data = bits2byte(data, size, 2) + elif mode == "4bits": + mode = "P" + data = bits2byte(data, size, 4) img = 
Image.frombytes(mode, size, data) if color_space == "/Indexed": from .generic import ByteStringObject @@ -820,8 +848,8 @@ def _handle_jpx( ): # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes mode: mode_str_type = "RGB" - if x_object_obj.get("/BitsPerComponent", 8) == 1: - mode = _get_imagemode("1bit", 0, "") + if x_object_obj.get("/BitsPerComponent", 8) < 8: + mode = _get_imagemode(f"{x_object_obj.get('/BitsPerComponent', 8)}bit", 0, "") else: mode = _get_imagemode( color_space, @@ -842,7 +870,11 @@ def _handle_jpx( lfilters = filters[-1] if isinstance(filters, list) else filters if lfilters == FT.FLATE_DECODE: img, image_format, extension = _handle_flate( - size, data, mode, color_space, colors + size, + data, + mode, + color_space, + colors, ) elif lfilters in (FT.LZW_DECODE, FT.ASCII_85_DECODE, FT.CCITT_FAX_DECODE): # I'm not sure if the following logic is correct. @@ -898,14 +930,6 @@ def _handle_jpx( # TODO : implement mask if alpha.mode != "L": alpha = alpha.convert("L") - scale = x_object_obj[IA.S_MASK].get("/Decode", [0.0, 1.0]) - if (scale[1] - scale[0]) != 1.0: - alpha = alpha.point( - [ - round(255.0 * (v / 255.0 * (scale[1] - scale[0]) + scale[0])) - for v in range(256) - ] - ) if img.mode == "P": img = img.convert("RGB") img.putalpha(alpha) diff --git a/tests/test_filters.py b/tests/test_filters.py index 9e0acef5c..bcb79ba45 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -438,3 +438,22 @@ def test_cascaded_filters_images(): for p in reader.pages: for i in p.images: _ = i.name, i.image + + +@pytest.mark.enable_socket() +def test_2bits_image(): + """From #1954, test with 2bits image. 
TODO: 4bits also""" + url = "https://github.com/py-pdf/pypdf/files/12050253/tt.pdf" + name = "paid.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + url_png = "https://user-images.githubusercontent.com/4083478/253568117-ca95cc85-9dea-4145-a5e0-032f1c1aa322.png" + name_png = "Paid.png" + refimg = Image.open( + BytesIO(get_pdf_from_url(url_png, name=name_png)) + ) # not a pdf but it works + data = reader.pages[0].images[0] + diff = ImageChops.difference(data.image, refimg) + d = sqrt( + sum([(a * a + b * b + c * c + d * d) for a, b, c, d in diff.getdata()]) + ) / (diff.size[0] * diff.size[1]) + assert d < 0.01