From 49be828c26186c1ae355710098a53591101408bb Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Fri, 8 Dec 2023 07:59:32 +0000 Subject: [PATCH] Handle optional param to resize rasterized image if too large New param: PDF_RASTERIZER_MAX_LENGTH --- README.md | 1 + src/app/engine/rasterizers.py | 34 +++++++++++++++++++++++++++++++++- src/app/settings.py | 1 + 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 84f9308..769e7d2 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,7 @@ The following list of environment variables are supported: | `PDF_RASTERIZER_THREAD_COUNT` | `3` | Engine | The number of concurrent [Poppler](https://poppler.freedesktop.org/) threads spawned when a worker is rasterizing a PDF. Each thread typically consumes 100% of a CPU core. | | `PDF_RASTERIZER_DPI` | `500` | Engine | The DPI of images generated during the rasterization process. For JPEG's, the default value of `500` typically produces images approximately 1.5MiB to 2MiB in size. | | `PDF_RASTERIZER_FORMAT` | `jpg` | Engine | The format to generate rasterized images in. Supported values are `ppm`, `jpeg` / `jpg`, `png` and `tiff` | +| `PDF_RASTERIZER_MAX_LENGTH` | `0` | Engine | Optional. The maximum length, in pixels, of the longest edge of a saved image. If a rasterized image exceeds this, it is resized while maintaining its aspect ratio. The default of `0` disables resizing. | | `DLCS_API_ROOT` | `https://api.dlcs.digirati.io` | Engine | The root URI of the API of the target DLCS deployment, without the trailing slash. | | `DLCS_S3_BUCKET_NAME` | `dlcs-composite-images` | Engine | The S3 bucket that the Composite Handler will push rasterized images to, for consumption by the wider DLCS. Both the Composite Handler and the DLCS must have access to this bucket. | | `DLCS_S3_OBJECT_KEY_PREFIX` | `composites` | Engine | The S3 key prefix to use when pushing images to the `DLCS_S3_BUCKET_NAME` - in other words, the folder within the S3 bucket into which images are stored. 
|
diff --git a/src/app/engine/rasterizers.py b/src/app/engine/rasterizers.py
index 60e1b65..9584a4f 100644
--- a/src/app/engine/rasterizers.py
+++ b/src/app/engine/rasterizers.py
@@ -1,9 +1,12 @@
 import os
+import logging
 
 from django.conf import settings
 from pdf2image import convert_from_path
 from PIL import Image
 
+logger = logging.getLogger(__name__)
+
 Image.MAX_IMAGE_PIXELS = 1000000000
 
 
@@ -12,13 +15,14 @@ def __init__(self):
         self._dpi = settings.PDF_RASTERIZER["dpi"]
         self._fmt = settings.PDF_RASTERIZER["format"]
         self._thread_count = settings.PDF_RASTERIZER["thread_count"]
+        self._max_length = settings.PDF_RASTERIZER["max_length"]
 
     def rasterize_pdf(self, subfolder_path):
         # Typically, pdf2image will write generated images to a temporary path, after
         # which you can manipulate them. By providing 'output_file' and 'output_folder',
         # we can skip that second step and make pdf2image write directly to our desired
         # output folder, using our desired file name pattern.
-        return convert_from_path(
+        images = convert_from_path(
             os.path.join(subfolder_path, "source.pdf"),
             dpi=self._dpi,
             fmt=self._fmt,
@@ -26,3 +30,31 @@ def rasterize_pdf(self, subfolder_path):
             output_file="image-",
             output_folder=subfolder_path,
         )
+
+        return self.__rescale(images)
+
+    def __rescale(self, images):
+        """Downscale oversized images on disk and return the same list."""
+        # A falsy max_length (such as 0) turns resizing off.
+        if not self._max_length:
+            return images
+
+        for idx, im in enumerate(images):
+            w = im.width
+            h = im.height
+            if max(w, h) == 1:
+                logger.warning(f"image index {idx} is 1x1 pixel output")
+            if max(w, h) > self._max_length:
+                # exceeds max_length so reduce, keeping the aspect ratio; clamp
+                # to 1px so a very thin image never gets a zero-length edge
+                scale = min(self._max_length / w, self._max_length / h)
+                scale_w = max(1, int(w * scale))
+                scale_h = max(1, int(h * scale))
+
+                logger.info(
+                    f"resizing image index {idx} from {w},{h} to {scale_w},{scale_h}"
+                )
+                resized = im.resize((scale_w, scale_h), resample=Image.LANCZOS)
+                resized.save(im.filename)
+
+        return images
diff --git a/src/app/settings.py b/src/app/settings.py
index 33d52e8..a30117e 100644
--- a/src/app/settings.py
+++ b/src/app/settings.py @@ -170,6 +170,7 @@ "thread_count": env("PDF_RASTERIZER_THREAD_COUNT", cast=int, default=3), "format": env("PDF_RASTERIZER_FORMAT", cast=str, default="jpg"), "dpi": env("PDF_RASTERIZER_DPI", cast=int, default=500), + "max_length": env("PDF_RASTERIZER_MAX_LENGTH", cast=int, default=0), } ORIGIN_CONFIG = {"chunk_size": env("ORIGIN_CHUNK_SIZE", cast=int, default=8192)}