Skip to content

Commit

Permalink
Merge pull request #46 from dlcs/feature/handle_large_images
Browse files Browse the repository at this point in the history
Detect 1x1 pixel output and re-rasterize with fallback DPI
  • Loading branch information
donaldgray authored Dec 12, 2023
2 parents 010dc0c + 729d57c commit e031c34
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 27 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ The following list of environment variables are supported:
| `CACHE_URL` | None | API, Engine | The URL of the target cache, in a format acceptable to [django-environ](https://django-environ.readthedocs.io/en/latest/getting-started.html#usage), e.g. `dbcache://app_cache`. |
| `PDF_RASTERIZER_THREAD_COUNT` | `3` | Engine | The number of concurrent [Poppler](https://poppler.freedesktop.org/) threads spawned when a worker is rasterizing a PDF. Each thread typically consumes 100% of a CPU core. |
| `PDF_RASTERIZER_DPI` | `500` | Engine | The DPI of images generated during the rasterization process. For JPEGs, the default value of `500` typically produces images approximately 1.5MiB to 2MiB in size. |
| `PDF_RASTERIZER_FALLBACK_DPI` | `200` | Engine | The DPI used to re-rasterize any page that exceeds pdftoppm's memory limit at `PDF_RASTERIZER_DPI` and is therefore output as a single 1x1 pixel image (see https://github.com/Belval/pdf2image/issues/34). |
| `PDF_RASTERIZER_FORMAT` | `jpg` | Engine | The format to generate rasterized images in. Supported values are `ppm`, `jpeg` / `jpg`, `png` and `tiff` |
| `PDF_RASTERIZER_MAX_LENGTH` | `0` | Engine | Optional; the maximum length, in pixels, of the longest edge of a saved image. If a rasterized image exceeds this, it is resized, maintaining aspect ratio. The default of `0` disables resizing. |
| `DLCS_API_ROOT` | `https://api.dlcs.digirati.io` | Engine | The root URI of the API of the target DLCS deployment, without the trailing slash. |
Expand Down
109 changes: 82 additions & 27 deletions src/app/engine/rasterizers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import logging
from enum import Enum

from django.conf import settings
from pdf2image import convert_from_path
Expand All @@ -10,9 +11,16 @@
Image.MAX_IMAGE_PIXELS = 1000000000


class ResizeResult(Enum):
    """Outcome of checking a single rasterized image against the size rules."""

    # The image was within limits; nothing was changed.
    NOOP = 1
    # The image exceeded the configured max length and was downscaled and re-saved.
    RESIZED = 2
    # The image is a 1x1 pixel output and needs re-rasterizing,
    # see https://github.com/Belval/pdf2image/issues/34
    SINGLE_PIXEL = 3


class PdfRasterizer:
def __init__(self):
    """Read the rasterizer configuration from the PDF_RASTERIZER Django setting."""
    rasterizer_settings = settings.PDF_RASTERIZER

    # DPI used for the initial rasterization of the whole PDF
    self._dpi = rasterizer_settings["dpi"]
    # DPI used when a page has to be re-rasterized after producing a 1x1 pixel image
    self._fallback_dpi = rasterizer_settings["fallback_dpi"]
    # output image format handed to pdf2image
    self._fmt = rasterizer_settings["format"]
    # thread count handed to pdf2image
    self._thread_count = rasterizer_settings["thread_count"]
    # maximum length of the longest edge, in pixels; a falsy value disables resizing
    self._max_length = rasterizer_settings["max_length"]
Expand All @@ -22,39 +30,86 @@ def rasterize_pdf(self, subfolder_path):
# which you can manipulate them. By providing 'output_file' and 'output_folder',
# we can skip that second step and make pdf2image write directly to our desired
# output folder, using our desired file name pattern.
images = convert_from_path(
os.path.join(subfolder_path, "source.pdf"),
dpi=self._dpi,
pdf_source = os.path.join(subfolder_path, "source.pdf")
images = self.__rasterize(pdf_source, subfolder_path, dpi=self._dpi)
images = self.__validate_rasterized_images(images, pdf_source, subfolder_path)
return images

def __rasterize(
    self, pdf_source, subfolder_path, start_page=None, last_page=None, dpi=None
):
    """
    Rasterize pdf_source with pdf2image, writing the images directly to subfolder_path.

    :param pdf_source: path of the PDF to rasterize
    :param subfolder_path: folder the rasterized images are written to
    :param start_page: first page to rasterize (passed to pdf2image as first_page)
    :param last_page: last page to rasterize
    :param dpi: DPI to rasterize at; when not provided the fallback DPI is used
    :return: the list returned by convert_from_path
    """
    # The return value from convert_from_path is a list of all images in the output directory that have the
    # appropriate extension and start with output_file. Because of this, use a different output_file prefix for
    # the initial rasterization ("image-") and for further page-by-page rasterization ("imager-").
    output_file = "imager-" if start_page else "image-"
    return convert_from_path(
        pdf_source,
        first_page=start_page,
        last_page=last_page,
        dpi=dpi or self._fallback_dpi,
        fmt=self._fmt,
        thread_count=self._thread_count,
        output_file=output_file,
        output_folder=subfolder_path,
    )

return self.__rescale(images)

def __rescale(self, images):
if not self._max_length:
return images

def __validate_rasterized_images(self, images, pdf_source, subfolder_path):
    """
    Validate that rasterized images don't exceed max_length (if set) and that no 1x1 pixel output has been
    generated. See https://github.com/Belval/pdf2image/issues/34

    Any page that came out as a single pixel is re-rasterized on its own and its entry in images is replaced.

    :param images: images returned by the initial rasterization, in page order
    :param pdf_source: path of the PDF the images were rasterized from
    :param subfolder_path: folder the rasterized images are written to
    :return: the list of images, with any single pixel pages replaced
    """
    single_pixel_pages = []
    for idx, im in enumerate(images):
        # size checks and resizing live in __ensure_image_size only, so that the max_length guard is applied
        # consistently and an image is never resized twice
        res = self.__ensure_image_size(idx, im)
        if res == ResizeResult.SINGLE_PIXEL:
            # pdf page numbers are 1-based, list indexes are 0-based
            single_pixel_pages.append(idx + 1)

    if single_pixel_pages:
        return self.__rescale_single_page_default_dpi(
            pdf_source, subfolder_path, single_pixel_pages, images
        )

    return images

def __ensure_image_size(self, idx, im: "Image.Image") -> "ResizeResult":
    """
    Check a single rasterized image, downscaling the saved file if it exceeds max_length.

    :param idx: 0-based index of the image, used in log messages only
    :param im: the rasterized image; its filename is where a resized copy is saved
    :return: SINGLE_PIXEL if the image is 1x1, RESIZED if it was downscaled and saved, otherwise NOOP
    """
    w = im.width
    h = im.height
    longest_edge = max(w, h)

    if longest_edge == 1:
        logger.warning(f"image index {idx} is 1x1 pixel output")
        return ResizeResult.SINGLE_PIXEL

    # a falsy max_length (e.g. 0) means resizing is disabled
    if self._max_length and longest_edge > self._max_length:
        # exceeds max_length so reduce, maintaining aspect ratio
        scale = self._max_length / longest_edge
        # never let the shorter edge round down to 0 for extreme aspect ratios, a 0 length edge cannot be resized to
        scale_w = max(1, int(w * scale))
        scale_h = max(1, int(h * scale))

        logger.info(
            f"resizing image index {idx} from {w},{h} to {scale_w},{scale_h}"
        )
        resized = im.resize((scale_w, scale_h), resample=Image.LANCZOS)
        # overwrite the rasterized file; the in-memory im is left as it was
        resized.save(im.filename)
        return ResizeResult.RESIZED

    return ResizeResult.NOOP

def __rescale_single_page_default_dpi(
    self, pdf_source, subfolder_path, pages, images
):
    """
    Re-rasterize the given pages one at a time at the fallback DPI and replace their entries in images.

    :param pdf_source: path of the PDF to rasterize
    :param subfolder_path: folder the rasterized images are written to
    :param pages: 1-based page numbers to re-rasterize
    :param images: images from the initial rasterization; updated in place
    :return: images, with an updated entry for each page in pages
    """
    for count, p in enumerate(pages):
        idx = p - 1
        # no dpi is passed, so __rasterize uses the fallback dpi
        res = self.__rasterize(
            pdf_source, subfolder_path, start_page=p, last_page=p
        )
        # res contains every page-by-page image written so far, not only the one for this page, so the image
        # for this page is expected at position count
        updated_image = res[count]
        logger.debug(f"re-rasterizing page {p} - {updated_image.filename}")

        outcome = self.__ensure_image_size(idx, updated_image)
        if outcome == ResizeResult.SINGLE_PIXEL:
            logger.warning(
                f"page {p} is still 1x1 pixel output at fallback dpi {self._fallback_dpi}"
            )

        images[idx] = updated_image

    return images
1 change: 1 addition & 0 deletions src/app/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@
"thread_count": env("PDF_RASTERIZER_THREAD_COUNT", cast=int, default=3),
"format": env("PDF_RASTERIZER_FORMAT", cast=str, default="jpg"),
"dpi": env("PDF_RASTERIZER_DPI", cast=int, default=500),
"fallback_dpi": env("PDF_RASTERIZER_FALLBACK_DPI", cast=int, default=200),
"max_length": env("PDF_RASTERIZER_MAX_LENGTH", cast=int, default=0),
}

Expand Down

0 comments on commit e031c34

Please sign in to comment.