Skip to content

Commit

Permalink
TST: Compare extracted images against ground truth (#2072)
Browse files Browse the repository at this point in the history
A new function, `image_similarity`, quantifies the visual similarity of two images via the Mean Squared Error (MSE) of their pixel differences. This lets us compare each extracted image against the image we expect.

We cannot make a byte-wise comparison as updates to PIL can change the representation.

The new function helps us to ensure that updates to the pypdf code don't break image extraction.
  • Loading branch information
MartinThoma authored Aug 9, 2023
1 parent aad26dd commit 82e8681
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 14 deletions.
13 changes: 5 additions & 8 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from pypdf.generic import ArrayObject, DictionaryObject, NameObject, NumberObject

from . import get_pdf_from_url
from .test_images import image_similarity

filter_inputs = (
# "", '', """""",
Expand Down Expand Up @@ -386,16 +387,12 @@ def test_rgba():
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
url_png = "https://user-images.githubusercontent.com/4083478/238288207-b77dd38c-34b4-4f4f-810a-bf9db7ca0414.png"
name_png = "tika-972174_p0-im0.png"
refimg = Image.open(
BytesIO(get_pdf_from_url(url_png, name=name_png))
) # not a pdf but it works
data = reader.pages[0].images[0]
assert ".jp2" in data.name
diff = ImageChops.difference(data.image, refimg)
d = sqrt(
sum([(a * a + b * b + c * c + d * d) for a, b, c, d in diff.getdata()])
) / (diff.size[0] * diff.size[1])
assert d < 0.01
similarity = image_similarity(
data.image, BytesIO(get_pdf_from_url(url_png, name=name_png))
)
assert similarity > 0.99


@pytest.mark.enable_socket()
Expand Down
112 changes: 107 additions & 5 deletions tests/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@

from io import BytesIO
from pathlib import Path
from typing import Union

import pytest
from PIL import Image, ImageChops, ImageDraw

from pypdf import PdfReader
from pypdf._page import PageObject
Expand All @@ -21,6 +23,86 @@
SAMPLE_ROOT = PROJECT_ROOT / "sample-files"


def open_image(path: Union[Path, Image.Image, BytesIO]) -> Image.Image:
    """
    Return a Pillow image for ``path``.

    An ``Image.Image`` argument is handed back unchanged (no copy is made).
    Any other argument is opened with ``Image.open`` and copied, so the
    returned image remains usable after the opened file has been closed.
    """
    # Already an image: nothing to load.
    if isinstance(path, Image.Image):
        return path

    # Fail early with a plain assertion when a filesystem path is missing.
    if isinstance(path, Path):
        assert path.exists()

    with Image.open(path) as opened:
        # Copy while the file is still open; the copy outlives the handle.
        loaded = opened.copy()
    return loaded


def image_similarity(
    path1: Union[Path, Image.Image, BytesIO], path2: Union[Path, Image.Image, BytesIO]
) -> float:
    """
    Quantify how visually similar two images are.

    The score is ``1 - MSE``: every channel value of the pixel-wise
    difference image is divided by 255, squared, and averaged over all
    channel values of all pixels.

    Args:
        path1: First image, given as a path, an opened Pillow image or a
            ``BytesIO`` holding encoded image data.
        path2: Second image, in any of the same forms.

    Returns:
        ``0.0`` if the images differ in size or color mode, ``1.0`` if they
        are identical, otherwise ``1 - MSE``. A value above 0.9 means the
        images are almost the same.
    """
    image1 = open_image(path1)
    image2 = open_image(path2)

    # Images of different dimensions or color modes are not compared pixel
    # by pixel; they count as completely different.
    if image1.size != image2.size or image1.mode != image2.mode:
        return 0.0

    # Calculate the Mean Squared Error (MSE) over the difference image.
    diff = ImageChops.difference(image1, image2)
    pixels = list(diff.getdata())

    # Zero-area images of equal size and mode have nothing that differs;
    # without this guard ``pixels[0]`` below would raise IndexError.
    if not pixels:
        return 1.0

    if isinstance(pixels[0], tuple):
        # Multi-band image: each pixel is a tuple of channel values.
        mse = sum(sum((c / 255.0) ** 2 for c in p) for p in pixels) / (
            len(pixels) * len(pixels[0])
        )
    else:
        # Single-band image: each pixel is a plain number.
        mse = sum((p / 255.0) ** 2 for p in pixels) / len(pixels)

    return 1 - mse


def test_image_similarity_one():
    """Comparing an image file with itself must give a similarity of exactly 1."""
    sample = SAMPLE_ROOT / "018-base64-image/page-0-QuickPDFImd32aa1ab.png"
    assert image_similarity(sample, sample) == 1


def test_image_similarity_zero():
    """Two different sample images must give a similarity of exactly 0."""
    # NOTE(review): presumably these two samples differ in size or color
    # mode, which is what makes the score exactly 0 -- confirm against the
    # sample files.
    first = SAMPLE_ROOT / "018-base64-image/page-0-QuickPDFImd32aa1ab.png"
    second = SAMPLE_ROOT / "009-pdflatex-geotopo/page-23-Im2.png"
    assert image_similarity(first, second) == 0


def test_image_similarity_mid():
    """Blacking out a growing corner of an image must lower its similarity."""
    original = SAMPLE_ROOT / "018-base64-image/page-0-QuickPDFImd32aa1ab.png"
    modified = Image.open(original)
    painter = ImageDraw.Draw(modified)

    # Paint a small black rectangle: still very similar, but not identical.
    painter.rectangle([0, 0, 100, 100], fill=(0, 0, 0))
    small_patch = image_similarity(original, modified)
    assert 0.9 < small_patch < 1

    # Enlarge the black rectangle: similarity drops further but stays positive.
    painter.rectangle([0, 0, 200, 200], fill=(0, 0, 0))
    large_patch = image_similarity(original, modified)
    assert 0 < large_patch < small_patch


@pytest.mark.enable_socket()
def test_image_new_property():
url = "https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf"
Expand Down Expand Up @@ -92,8 +174,30 @@ def test_image_new_property():
"/Im2",
SAMPLE_ROOT / "009-pdflatex-geotopo/page-23-Im2.png",
),
# (SAMPLE_ROOT / "009-pdflatex-geotopo/GeoTopo.pdf", 30, '/Fm22',
# SAMPLE_ROOT / "009-pdflatex-geotopo/page-30-Fm22.png"),
(
SAMPLE_ROOT / "003-pdflatex-image/pdflatex-image.pdf",
0,
"/Im1",
SAMPLE_ROOT / "003-pdflatex-image/page-0-Im1.jpg",
),
(
SAMPLE_ROOT / "018-base64-image/base64image.pdf",
0,
"/QuickPDFImd32aa1ab",
SAMPLE_ROOT / "018-base64-image/page-0-QuickPDFImd32aa1ab.png",
),
(
SAMPLE_ROOT / "019-grayscale-image/grayscale-image.pdf",
0,
"/X0",
SAMPLE_ROOT / "019-grayscale-image/page-0-X0.png",
),
],
ids=[
"009-pdflatex-geotopo/page-23-Im2.png",
"003-pdflatex-image/page-0-Im1.jpg",
"018-base64-image/page-0-QuickPDFImd32aa1ab.png",
"019-grayscale-image/page-0-X0.png",
],
)
@pytest.mark.samples()
Expand All @@ -104,6 +208,4 @@ def test_image_extraction(src, page_index, image_key, expected):
# A little helper for test generation
with open(f"page-{page_index}-{actual_image.name}", "wb") as fp:
fp.write(actual_image.data)
with open(expected, "rb") as fp:
expected_data = fp.read()
assert actual_image.data == expected_data
assert image_similarity(BytesIO(actual_image.data), expected) >= 0.99

0 comments on commit 82e8681

Please sign in to comment.