From 729d57c95cdff10665859cac7ef858df171be209 Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Mon, 11 Dec 2023 16:58:25 +0000 Subject: [PATCH] Detect 1x1 pixel output and re-rasterized with fallback DPI --- README.md | 1 + src/app/engine/rasterizers.py | 109 +++++++++++++++++++++++++--------- src/app/settings.py | 1 + 3 files changed, 84 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 769e7d2..c81ab80 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,7 @@ The following list of environment variables are supported: | `CACHE_URL` | None | API, Engine | The URL of the target cache, in a format acceptable to [django-environ](https://django-environ.readthedocs.io/en/latest/getting-started.html#usage), e.g. `dbcache://app_cache`. | | `PDF_RASTERIZER_THREAD_COUNT` | `3` | Engine | The number of concurrent [Poppler](https://poppler.freedesktop.org/) threads spawned when a worker is rasterizing a PDF. Each thread typically consumes 100% of a CPU core. | | `PDF_RASTERIZER_DPI` | `500` | Engine | The DPI of images generated during the rasterization process. For JPEG's, the default value of `500` typically produces images approximately 1.5MiB to 2MiB in size. | +| `PDF_RASTERIZER_FALLBACK_DPI` | `200` | Engine | The DPI used to re-rasterize any page whose initial rasterization at `PDF_RASTERIZER_DPI` exceeds pdftoppm's memory limit and produces a 1x1 pixel image (see https://github.com/Belval/pdf2image/issues/34). | | `PDF_RASTERIZER_FORMAT` | `jpg` | Engine | The format to generate rasterized images in. Supported values are `ppm`, `jpeg` / `jpg`, `png` and `tiff` | | `PDF_RASTERIZER_MAX_LENGTH` | `0` | Engine | Optional, the maximum size of pixels on longest edge that will be saved. If rasterized image exceeds this it will be resized, maintaining aspect ratio. | | `DLCS_API_ROOT` | `https://api.dlcs.digirati.io` | Engine | The root URI of the API of the target DLCS deployment, without the trailing slash. 
| diff --git a/src/app/engine/rasterizers.py b/src/app/engine/rasterizers.py index 9584a4f..f9fc2bb 100644 --- a/src/app/engine/rasterizers.py +++ b/src/app/engine/rasterizers.py @@ -1,5 +1,6 @@ import os import logging +from enum import Enum from django.conf import settings from pdf2image import convert_from_path @@ -10,9 +11,16 @@ Image.MAX_IMAGE_PIXELS = 1000000000 +class ResizeResult(Enum): + NOOP = 1 + RESIZED = 2 + SINGLE_PIXEL = 3 + + class PdfRasterizer: def __init__(self): self._dpi = settings.PDF_RASTERIZER["dpi"] + self._fallback_dpi = settings.PDF_RASTERIZER["fallback_dpi"] self._fmt = settings.PDF_RASTERIZER["format"] self._thread_count = settings.PDF_RASTERIZER["thread_count"] self._max_length = settings.PDF_RASTERIZER["max_length"] @@ -22,39 +30,86 @@ def rasterize_pdf(self, subfolder_path): # which you can manipulate them. By providing 'output_file' and 'output_folder', # we can skip that second step and make pdf2image write directly to our desired # output folder, using our desired file name pattern. - images = convert_from_path( - os.path.join(subfolder_path, "source.pdf"), - dpi=self._dpi, + pdf_source = os.path.join(subfolder_path, "source.pdf") + images = self.__rasterize(pdf_source, subfolder_path, dpi=self._dpi) + images = self.__validate_rasterized_images(images, pdf_source, subfolder_path) + return images + + def __rasterize( + self, pdf_source, subfolder_path, start_page=None, last_page=None, dpi=None + ): + # return value from convert_from_path is a list of all images in output directory that have appropriate + # extension and start with output_file. 
Due to this use a different output_file name for initial rasterizing and + # further page-by-page rasterizing + output_file = "imager-" if start_page else "image-" + return convert_from_path( + pdf_source, + first_page=start_page, + last_page=last_page, + dpi=dpi or self._fallback_dpi, fmt=self._fmt, thread_count=self._thread_count, - output_file="image-", + output_file=output_file, output_folder=subfolder_path, ) - return self.__rescale(images) - - def __rescale(self, images): - if not self._max_length: - return images - + def __validate_rasterized_images(self, images, pdf_source, subfolder_path): + """ + Validate that rasterized images don't exceed max_size (if set) and that a single 1x1 pixel output has not been + generated. see https://github.com/Belval/pdf2image/issues/34 + """ + single_pixel_pages = [] idx = 0 for im in images: - w = im.width - h = im.height - filename = im.filename - if max(w, h) == 1: - logger.warning(f"image index {idx} is 1x1 pixel output") - if max(w, h) > self._max_length: - # exceeds max_length so reduce - scale = min(self._max_length / w, self._max_length / h) - scale_w = int(w * scale) - scale_h = int(h * scale) - - logger.info( - f"resizing image index {idx} from {w},{h} to {scale_w},{scale_h}" - ) - resized = im.resize((scale_w, scale_h), resample=Image.LANCZOS) - resized.save(filename) - + res = self.__ensure_image_size(idx, im) + if res == ResizeResult.SINGLE_PIXEL: + single_pixel_pages.append(idx + 1) idx += 1 + + if single_pixel_pages: + return self.__rescale_single_page_default_dpi( + pdf_source, subfolder_path, single_pixel_pages, images + ) + + return images + + def __ensure_image_size(self, idx, im: Image): + w = im.width + h = im.height + filename = im.filename + if max(w, h) == 1: + logger.warning(f"image index {idx} is 1x1 pixel output") + return ResizeResult.SINGLE_PIXEL + if self._max_length and max(w, h) > self._max_length: + # exceeds max_length so reduce + scale = min(self._max_length / w, self._max_length / h) + 
scale_w = int(w * scale) + scale_h = int(h * scale) + + logger.info( + f"resizing image index {idx} from {w},{h} to {scale_w},{scale_h}" + ) + resized = im.resize((scale_w, scale_h), resample=Image.LANCZOS) + resized.save(filename) + return ResizeResult.RESIZED + + return ResizeResult.NOOP + + def __rescale_single_page_default_dpi( + self, pdf_source, subfolder_path, pages, images + ): + count = 0 + for p in pages: + idx = p - 1 + res = self.__rasterize( + pdf_source, subfolder_path, start_page=p, last_page=p + ) + updated_image = res[count] + self.__ensure_image_size(idx, updated_image) + + logger.debug(f"re-rasterizing page {p} - {updated_image.filename}") + + images[idx] = updated_image + count += 1 + return images diff --git a/src/app/settings.py b/src/app/settings.py index a30117e..af2ff3d 100644 --- a/src/app/settings.py +++ b/src/app/settings.py @@ -170,6 +170,7 @@ "thread_count": env("PDF_RASTERIZER_THREAD_COUNT", cast=int, default=3), "format": env("PDF_RASTERIZER_FORMAT", cast=str, default="jpg"), "dpi": env("PDF_RASTERIZER_DPI", cast=int, default=500), + "fallback_dpi": env("PDF_RASTERIZER_FALLBACK_DPI", cast=int, default=200), "max_length": env("PDF_RASTERIZER_MAX_LENGTH", cast=int, default=0), }