TST: Compare extracted images against ground truth (#2072)

A function `image_similarity` was introduced which quantifies the visual similarity of two images via the Mean Squared Error (MSE). This way we can compare the extracted images with the ground truth we expect. We cannot make a byte-wise comparison because updates to PIL can change the encoded byte representation of an image without changing how it looks. The new function helps us ensure that updates to the pypdf code don't break image extraction.
py-pdf · Aug 9, 2023 · 82e8681 · 82e8681
1 parent aad26dd
commit 82e8681
Show file tree

Hide file tree

Showing 3 changed files with 113 additions and 14 deletions.
diff --git a/sample-files b/sample-files
diff --git a/tests/test_filters.py b/tests/test_filters.py
@@ -22,6 +22,7 @@
 from pypdf.generic import ArrayObject, DictionaryObject, NameObject, NumberObject
 
 from . import get_pdf_from_url
+from .test_images import image_similarity
 
 filter_inputs = (
  # "", '', """""",
@@ -386,16 +387,12 @@ def test_rgba():
  reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
  url_png = "https://user-images.githubusercontent.com/4083478/238288207-b77dd38c-34b4-4f4f-810a-bf9db7ca0414.png"
  name_png = "tika-972174_p0-im0.png"
- refimg = Image.open(
- BytesIO(get_pdf_from_url(url_png, name=name_png))
- ) # not a pdf but it works
  data = reader.pages[0].images[0]
  assert ".jp2" in data.name
- diff = ImageChops.difference(data.image, refimg)
- d = sqrt(
- sum([(a * a + b * b + c * c + d * d) for a, b, c, d in diff.getdata()])
- ) / (diff.size[0] * diff.size[1])
- assert d < 0.01
+ similarity = image_similarity(
+ data.image, BytesIO(get_pdf_from_url(url_png, name=name_png))
+ )
+ assert similarity > 0.99
 
 
 @pytest.mark.enable_socket()

diff --git a/tests/test_images.py b/tests/test_images.py
@@ -7,8 +7,10 @@
 
 from io import BytesIO
 from pathlib import Path
+from typing import Union
 
 import pytest
+from PIL import Image, ImageChops, ImageDraw
 
 from pypdf import PdfReader
 from pypdf._page import PageObject
@@ -21,6 +23,86 @@
 SAMPLE_ROOT = PROJECT_ROOT / "sample-files"
 
 
+def open_image(path: Union[Path, Image.Image, BytesIO]) -> Image.Image:
+    """
+    Return ``path`` as a PIL image; an ``Image.Image`` is returned unchanged.
+    """
+    if isinstance(path, Image.Image):
+        return path
+    if isinstance(path, Path):
+        assert path.exists()
+    # The copy stays usable after the ``with`` block has closed the file.
+    with Image.open(path) as opened:
+        return opened.copy()
+
+
+def image_similarity(
+    path1: Union[Path, Image.Image, BytesIO], path2: Union[Path, Image.Image, BytesIO]
+) -> float:
+    """
+    Return the visual similarity of two images, derived from their MSE.
+
+    1 means the images are identical, a value above 0.9 means they are
+    almost the same, and 0 means they are different. Images whose sizes
+    or color modes differ always score 0.
+
+    This can be used to ensure visual similarity.
+    """
+    image1 = open_image(path1)
+    image2 = open_image(path2)
+
+    # A pixel-wise comparison needs matching dimensions and color modes.
+    if image1.size != image2.size or image1.mode != image2.mode:
+        return 0.0
+
+    pixels = list(ImageChops.difference(image1, image2).getdata())
+    if not pixels:
+        # Zero-area images: nothing can differ, and pixels[0] would raise.
+        return 1.0
+
+    # Multi-band modes give one tuple per pixel, single-band modes a number.
+    # Differences are scaled by 255 before squaring.
+    if isinstance(pixels[0], tuple):
+        squared_error = sum(sum((c / 255.0) ** 2 for c in p) for p in pixels)
+        sample_count = len(pixels) * len(pixels[0])
+    else:
+        squared_error = sum((p / 255.0) ** 2 for p in pixels)
+        sample_count = len(pixels)
+
+    # Mean Squared Error, turned into a similarity score.
+    return 1 - squared_error / sample_count
+
+
+def test_image_similarity_one():
+    """An image compared with itself must score exactly 1."""
+    sample = SAMPLE_ROOT / "018-base64-image/page-0-QuickPDFImd32aa1ab.png"
+    assert image_similarity(sample, sample) == 1
+
+
+def test_image_similarity_zero():
+    base64_png = SAMPLE_ROOT / "018-base64-image/page-0-QuickPDFImd32aa1ab.png"
+    geotopo_png = SAMPLE_ROOT / "009-pdflatex-geotopo/page-23-Im2.png"
+    assert image_similarity(base64_png, geotopo_png) == 0  # unrelated images
+
+
+def test_image_similarity_mid():
+    """Blacking out a growing area lowers the score without reaching 0."""
+    reference = SAMPLE_ROOT / "018-base64-image/page-0-QuickPDFImd32aa1ab.png"
+    altered = Image.open(reference)
+    canvas = ImageDraw.Draw(altered)
+
+    # Paint a black square over the top-left corner.
+    canvas.rectangle([0, 0, 100, 100], fill=(0, 0, 0))
+    small_patch = image_similarity(reference, altered)
+    assert 0.9 < small_patch < 1
+
+    # A larger black square covers more pixels and must score lower.
+    canvas.rectangle([0, 0, 200, 200], fill=(0, 0, 0))
+    large_patch = image_similarity(reference, altered)
+    assert large_patch < small_patch
+    assert large_patch > 0
+
+
 @pytest.mark.enable_socket()
 def test_image_new_property():
  url = "https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf"
@@ -92,8 +174,30 @@ def test_image_new_property():
  "/Im2",
  SAMPLE_ROOT / "009-pdflatex-geotopo/page-23-Im2.png",
  ),
- # (SAMPLE_ROOT / "009-pdflatex-geotopo/GeoTopo.pdf", 30, '/Fm22',
- # SAMPLE_ROOT / "009-pdflatex-geotopo/page-30-Fm22.png"),
+ (
+ SAMPLE_ROOT / "003-pdflatex-image/pdflatex-image.pdf",
+ 0,
+ "/Im1",
+ SAMPLE_ROOT / "003-pdflatex-image/page-0-Im1.jpg",
+ ),
+ (
+ SAMPLE_ROOT / "018-base64-image/base64image.pdf",
+ 0,
+ "/QuickPDFImd32aa1ab",
+ SAMPLE_ROOT / "018-base64-image/page-0-QuickPDFImd32aa1ab.png",
+ ),
+ (
+ SAMPLE_ROOT / "019-grayscale-image/grayscale-image.pdf",
+ 0,
+ "/X0",
+ SAMPLE_ROOT / "019-grayscale-image/page-0-X0.png",
+ ),
+ ],
+ ids=[
+ "009-pdflatex-geotopo/page-23-Im2.png",
+ "003-pdflatex-image/page-0-Im1.jpg",
+ "018-base64-image/page-0-QuickPDFImd32aa1ab.png",
+ "019-grayscale-image/page-0-X0.png",
  ],
 )
 @pytest.mark.samples()
@@ -104,6 +208,4 @@ def test_image_extraction(src, page_index, image_key, expected):
  # A little helper for test generation
  with open(f"page-{page_index}-{actual_image.name}", "wb") as fp:
  fp.write(actual_image.data)
- with open(expected, "rb") as fp:
- expected_data = fp.read()
- assert actual_image.data == expected_data
+ assert image_similarity(BytesIO(actual_image.data), expected) >= 0.99