BUG: process Separation ColorSpace

closes #1955
py-pdf · Jul 24, 2023 · 80b5a0d · 80b5a0d
1 parent 74f8175
commit 80b5a0d
Show file tree

Hide file tree

Showing 2 changed files with 62 additions and 23 deletions.
diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -650,8 +650,12 @@ def _get_imagemode(
  color_space: Union[str, List[Any], Any],
  color_components: int,
  prev_mode: mode_str_type,
-) -> mode_str_type:
- """Returns the image mode not taking into account mask(transparency)"""
+) -> Tuple[mode_str_type, bool]:
+ """
+ Returns
+ Image mode not taking into account mask(transparency)
+ ColorInversion is required (like for some DeviceCMYK)
+ """
  if isinstance(color_space, str):
  pass
  elif not isinstance(color_space, list):
@@ -668,12 +672,16 @@ def _get_imagemode(
  color_space = color_space[1]
  if isinstance(color_space, IndirectObject):
  color_space = color_space.get_object()
- mode2 = _get_imagemode(color_space, color_components, prev_mode)
+ mode2, invert_color = _get_imagemode(color_space, color_components, prev_mode)
  if mode2 in ("RGB", "CMYK"):
  mode2 = "P"
- return mode2
+ return mode2, invert_color
  elif color_space[0] == "/Separation":
  color_space = color_space[2]
+ if isinstance(color_space, IndirectObject):
+ color_space = color_space.get_object()
+ mode2, invert_color = _get_imagemode(color_space, color_components, prev_mode)
+ return mode2, True
  elif color_space[0] == "/DeviceN":
  color_components = len(color_space[1])
  color_space = color_space[2]
@@ -694,7 +702,7 @@ def _get_imagemode(
  or list(mode_map.values())[color_components]
  or prev_mode
  ) # type: ignore
- return mode
+ return mode, mode == "CMYK"
 
 
 def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes, Any]:
@@ -724,10 +732,10 @@ def _handle_flate(
  mode: mode_str_type,
  color_space: str,
  colors: int,
- ) -> Tuple[Image.Image, str, str]:
+ ) -> Tuple[Image.Image, str, str, bool]:
  """
  Process image encoded in flateEncode
- Returns img, image_format, extension
+ Returns img, image_format, extension, color inversion
  """
 
  def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
@@ -774,7 +782,7 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
  "P": (0, "", ""),
  "RGB": (3, "P", "RGB"),
  "CMYK": (4, "P", "CMYK"),
- }[_get_imagemode(base, 0, "")]
+ }[_get_imagemode(base, 0, "")[0]]
  except KeyError: # pragma: no cover
  logger_warning(
  f"Base {base} not coded please share the pdf file with pypdf dev team",
@@ -800,30 +808,30 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
  elif not isinstance(color_space, NullObject) and color_space[0] == "/ICCBased":
  # see Table 66 - Additional Entries Specific to an ICC Profile
  # Stream Dictionary
- mode2 = _get_imagemode(color_space, colors, mode)
+ mode2 = _get_imagemode(color_space, colors, mode)[0]
  if mode != mode2:
  img = Image.frombytes(
  mode2, size, data
  ) # reloaded as mode may have change
  if mode == "CMYK":
  extension = ".tif"
  image_format = "TIFF"
- return img, image_format, extension
+ return img, image_format, extension, False
 
  def _handle_jpx(
  size: Tuple[int, int],
  data: bytes,
  mode: mode_str_type,
  color_space: str,
  colors: int,
- ) -> Tuple[Image.Image, str, str]:
+ ) -> Tuple[Image.Image, str, str, bool]:
  """
  Process image encoded in flateEncode
- Returns img, image_format, extension
+ Returns img, image_format, extension, inversion
  """
  extension = ".jp2" # mime_type = "image/x-jp2"
  img1 = Image.open(BytesIO(data), formats=("JPEG2000",))
- mode = _get_imagemode(color_space, colors, mode)
+ mode, invert_color = _get_imagemode(color_space, colors, mode)
  if img1.mode == "RGBA" and mode == "RGB":
  mode = "RGBA"
  # we need to convert to the good mode
@@ -840,7 +848,7 @@ def _handle_jpx(
  if img.mode == "CMYK":
  img = img.convert("RGB")
  image_format = "JPEG2000"
- return img, image_format, extension
+ return img, image_format, extension, invert_color
 
  # for error reporting
  if (
@@ -861,9 +869,11 @@ def _handle_jpx(
  # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
  mode: mode_str_type = "RGB"
  if x_object_obj.get("/BitsPerComponent", 8) < 8:
- mode = _get_imagemode(f"{x_object_obj.get('/BitsPerComponent', 8)}bit", 0, "")
+ mode, invert_color = _get_imagemode(
+ f"{x_object_obj.get('/BitsPerComponent', 8)}bit", 0, ""
+ )
  else:
- mode = _get_imagemode(
+ mode, invert_color = _get_imagemode(
  color_space,
  2
  if (
@@ -881,7 +891,7 @@ def _handle_jpx(
  filters = x_object_obj.get(SA.FILTER, [None])
  lfilters = filters[-1] if isinstance(filters, list) else filters
  if lfilters == FT.FLATE_DECODE:
- img, image_format, extension = _handle_flate(
+ img, image_format, extension, invert_color = _handle_flate(
  size,
  data,
  mode,
@@ -902,25 +912,33 @@ def _handle_jpx(
  img = Image.open(BytesIO(data), formats=("TIFF", "PNG"))
  elif lfilters == FT.DCT_DECODE:
  img, image_format, extension = Image.open(BytesIO(data)), "JPEG", ".jpg"
+ # invert_color kept unchanged
  elif lfilters == FT.JPX_DECODE:
- img, image_format, extension = _handle_jpx(
+ img, image_format, extension, invert_color = _handle_jpx(
  size, data, mode, color_space, colors
  )
  elif lfilters == FT.CCITT_FAX_DECODE:
- img, image_format, extension = (
+ img, image_format, extension, invert_color = (
  Image.open(BytesIO(data), formats=("TIFF",)),
  "TIFF",
  ".tiff",
+ False,
+ )
+ else:
+ img, image_format, extension, invert_color = (
+ Image.frombytes(mode, size, data),
+ "PNG",
+ ".png",
+ False,
  )
- elif lfilters is None:
- img, image_format, extension = Image.frombytes(mode, size, data), "PNG", ".png"
 
- # CMYK image without decode requires reverting scale (cf p243,2§ last sentence)
+ # CMYK image and other colorspaces without decode
+ # requires reverting scale (cf p243,2§ last sentence)
  decode = x_object_obj.get(
  IA.DECODE,
  ([1.0, 0.0] * len(img.getbands()))
  if (
- (img.mode == "CMYK" or (mode == "CMYK" and img.mode == "L"))
+ (img.mode == "CMYK" or (invert_color and img.mode == "L"))
  and lfilters in (FT.DCT_DECODE, FT.JPX_DECODE)
  )
  else None,

diff --git a/tests/test_filters.py b/tests/test_filters.py
@@ -506,3 +506,24 @@ def test_gray_devicen_cmyk():
  diff = ImageChops.difference(data.image, refimg)
  d = sqrt(sum([(a * a) for a in diff.getdata()])) / (diff.size[0] * diff.size[1])
  assert d < 0.001
+
+
+@pytest.mark.enable_socket()
+def test_gray_separation_cmyk():
+ """
+ Cf #1955
+ Gray Image in Separation/RGB : requiring reverse
+ """
+ url = "https://github.com/py-pdf/pypdf/files/12143372/tt.pdf"
+ name = "TestWithSeparationBlack.pdf"
+ reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+ url_png = "https://user-images.githubusercontent.com/4083478/254545494-42df4949-1557-4f2d-acca-6be6e8de1122.png"
+ name_png = "velo.png" # reused
+ refimg = Image.open(
+ BytesIO(get_pdf_from_url(url_png, name=name_png))
+ ) # not a pdf but it works
+ data = reader.pages[0].images[0]
+ assert data.image.mode == "L"
+ diff = ImageChops.difference(data.image, refimg)
+ d = sqrt(sum([(a * a) for a in diff.getdata()])) / (diff.size[0] * diff.size[1])
+ assert d < 0.001