diff --git a/tests/test_filters.py b/tests/test_filters.py
index 63f84d628..5c374e40d 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -19,9 +19,10 @@
     CCITTFaxDecode,
     FlateDecode,
 )
-from pypdf.generic import ArrayObject, DictionaryObject, NumberObject
+from pypdf.generic import ArrayObject, DictionaryObject, NameObject, NumberObject
 
 from . import get_pdf_from_url
+from .test_images import image_similarity
 
 filter_inputs = (
     # "", '', """""",
@@ -287,6 +288,31 @@ def test_issue_1737():
     reader.pages[0]["/Resources"]["/XObject"]["/Im2"].get_data()
 
 
+@pytest.mark.enable_socket()
+def test_pa_image_extraction():
+    """
+    PNG images with PA mode can be extracted.
+
+    This is a regression test for issue #1801
+    """
+    url = "https://github.com/py-pdf/pypdf/files/11250359/test_img.pdf"
+    name = "issue-1801.pdf"
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+
+    page0 = reader.pages[0]
+    images = page0.images
+    assert len(images) == 1
+    assert images[0].name == "Im1.png"
+
+    # Ensure visual appearance
+    data = get_pdf_from_url(
+        "https://user-images.githubusercontent.com/"
+        "1658117/232842886-9d1b0726-3a5b-430d-8464-595d919c266c.png",
+        "issue-1801.png",
+    )
+    assert data == images[0].data
+
+
 @pytest.mark.enable_socket()
 def test_1bit_image_extraction():
     """Cf issue #1814"""
@@ -313,6 +339,45 @@ def test_png_transparency_reverse():
     # assert list(img.getdata()) == list(refimg.getdata())
 
 
+@pytest.mark.enable_socket()
+def test_iss1787():
+    """Cf issue #1787"""
+    url = "https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf"
+    name = "pdf_font_garbled.pdf"
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    url_png = "https://user-images.githubusercontent.com/4083478/236793172-09340aef-3440-4c8a-af85-a91cdad27d46.png"
+    name_png = "watermark1.png"
+    refimg = Image.open(
+        BytesIO(get_pdf_from_url(url_png, name=name_png))
+    )  # not a pdf but it works
+    data = reader.pages[0].images[0]
+    img = Image.open(BytesIO(data.data))
+    assert ".png" in data.name
+    assert list(img.getdata()) == list(refimg.getdata())
+    obj = data.indirect_reference.get_object()
+    obj["/DecodeParms"][NameObject("/Columns")] = NumberObject(1000)
+    obj.decoded_self = None
+    with pytest.raises(PdfReadError) as exc:
+        reader.pages[0].images[0]
+    assert exc.value.args[0] == "Image data is not rectangular"
+
+
+@pytest.mark.enable_socket()
+def test_rgba():
+    """Decode rgb with transparency"""
+    url = "https://corpora.tika.apache.org/base/docs/govdocs1/972/972174.pdf"
+    name = "tika-972174.pdf"
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    url_png = "https://user-images.githubusercontent.com/4083478/238288207-b77dd38c-34b4-4f4f-810a-bf9db7ca0414.png"
+    name_png = "tika-972174_p0-im0.png"
+    data = reader.pages[0].images[0]
+    assert ".jp2" in data.name
+    assert (
+        image_similarity(data.image, BytesIO(get_pdf_from_url(url_png, name=name_png)))
+        > 0.99
+    )
+
+
 @pytest.mark.enable_socket()
 def test_cmyk():
     """Decode cmyk"""
diff --git a/tests/test_images.py b/tests/test_images.py
index 03b820cf8..71b750b20 100644
--- a/tests/test_images.py
+++ b/tests/test_images.py
@@ -13,8 +13,6 @@
 
 from pypdf import PdfReader
 from pypdf._page import PageObject
-from pypdf.errors import PdfReadError
-from pypdf.generic import NameObject, NumberObject
 
 from . import get_pdf_from_url
 
@@ -170,54 +168,6 @@ def test_image_extraction(src, page_index, image_key, expected):
     assert image_similarity(BytesIO(actual_image.data), expected) >= 0.99
 
 
-@pytest.mark.enable_socket()
-def test_pa_image_extraction():
-    """
-    PNG images with PA mode can be extracted.
-
-    This is a regression test for issue #1801
-    """
-    url = "https://github.com/py-pdf/pypdf/files/11250359/test_img.pdf"
-    name = "issue-1801.pdf"
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
-
-    page0 = reader.pages[0]
-    images = page0.images
-    assert len(images) == 1
-    assert images[0].name == "Im1.png"
-
-    # Ensure visual appearence
-    data = get_pdf_from_url(
-        "https://user-images.githubusercontent.com/"
-        "1658117/232842886-9d1b0726-3a5b-430d-8464-595d919c266c.png",
-        "issue-1801.png",
-    )
-    assert data == images[0].data
-
-
-@pytest.mark.enable_socket()
-def test_iss1787():
-    """Cf issue #1787"""
-    url = "https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf"
-    name = "pdf_font_garbled.pdf"
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
-    url_png = "https://user-images.githubusercontent.com/4083478/236793172-09340aef-3440-4c8a-af85-a91cdad27d46.png"
-    name_png = "watermark1.png"
-    refimg = Image.open(
-        BytesIO(get_pdf_from_url(url_png, name=name_png))
-    )  # not a pdf but it works
-    data = reader.pages[0].images[0]
-    img = Image.open(BytesIO(data.data))
-    assert ".png" in data.name
-    assert list(img.getdata()) == list(refimg.getdata())
-    obj = data.indirect_reference.get_object()
-    obj["/DecodeParms"][NameObject("/Columns")] = NumberObject(1000)
-    obj.decoded_self = None
-    with pytest.raises(PdfReadError) as exc:
-        reader.pages[0].images[0]
-    assert exc.value.args[0] == "Image data is not rectangular"
-
-
 @pytest.mark.enable_socket()
 def test_tiff_predictor():
     """Decode Tiff Predictor 2 Images"""
@@ -233,19 +183,3 @@ def test_tiff_predictor():
     img = Image.open(BytesIO(data.data))
     assert ".png" in data.name
     assert list(img.getdata()) == list(refimg.getdata())
-
-
-@pytest.mark.enable_socket()
-def test_rgba():
-    """Decode rgb with transparency"""
-    url = "https://corpora.tika.apache.org/base/docs/govdocs1/972/972174.pdf"
-    name = "tika-972174.pdf"
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
-    url_png = "https://user-images.githubusercontent.com/4083478/238288207-b77dd38c-34b4-4f4f-810a-bf9db7ca0414.png"
-    name_png = "tika-972174_p0-im0.png"
-    data = reader.pages[0].images[0]
-    assert ".jp2" in data.name
-    assert (
-        image_similarity(data.image, BytesIO(get_pdf_from_url(url_png, name=name_png)))
-        > 0.99
-    )