Skip to content

Commit

Permalink
Merge pull request #42 from dlcs/feature/set_dpi
Browse files Browse the repository at this point in the history
Handle optional param to resize rasterized image if too large
  • Loading branch information
donaldgray authored Dec 11, 2023
2 parents 68f9a32 + 49be828 commit 010dc0c
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 1 deletion.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ The following list of environment variables are supported:
| `PDF_RASTERIZER_THREAD_COUNT` | `3` | Engine | The number of concurrent [Poppler](https://poppler.freedesktop.org/) threads spawned when a worker is rasterizing a PDF. Each thread typically consumes 100% of a CPU core. |
| `PDF_RASTERIZER_DPI` | `500` | Engine | The DPI of images generated during the rasterization process. For JPEGs, the default value of `500` typically produces images approximately 1.5MiB to 2MiB in size. |
| `PDF_RASTERIZER_FORMAT` | `jpg` | Engine | The format to generate rasterized images in. Supported values are `ppm`, `jpeg` / `jpg`, `png` and `tiff` |
| `PDF_RASTERIZER_MAX_LENGTH` | `0` | Engine | Optional. The maximum length, in pixels, of the longest edge of a saved image. If a rasterized image exceeds this, it is resized while maintaining its aspect ratio. The default value of `0` disables resizing. |
| `DLCS_API_ROOT` | `https://api.dlcs.digirati.io` | Engine | The root URI of the API of the target DLCS deployment, without the trailing slash. |
| `DLCS_S3_BUCKET_NAME` | `dlcs-composite-images` | Engine | The S3 bucket that the Composite Handler will push rasterized images to, for consumption by the wider DLCS. Both the Composite Handler and the DLCS must have access to this bucket. |
| `DLCS_S3_OBJECT_KEY_PREFIX` | `composites` | Engine | The S3 key prefix to use when pushing images to the `DLCS_S3_BUCKET_NAME` - in other words, the folder within the S3 bucket into which images are stored. |
Expand Down
34 changes: 33 additions & 1 deletion src/app/engine/rasterizers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import os
import logging

from django.conf import settings
from pdf2image import convert_from_path
from PIL import Image

# Obtain the logger through getLogger() rather than instantiating Logger
# directly: a directly-constructed Logger has no parent, so it never reaches
# the handlers and levels configured for the application.
logger = logging.getLogger(__name__)

# Override Pillow's module-wide pixel-count limit with 1,000,000,000 pixels.
# NOTE(review): presumably raised so that large, high-DPI rasterized pages can
# be opened without tripping Pillow's oversized-image check - confirm.
Image.MAX_IMAGE_PIXELS = 1000000000


Expand All @@ -12,17 +15,46 @@ def __init__(self):
self._dpi = settings.PDF_RASTERIZER["dpi"]
self._fmt = settings.PDF_RASTERIZER["format"]
self._thread_count = settings.PDF_RASTERIZER["thread_count"]
self._max_length = settings.PDF_RASTERIZER["max_length"]

def rasterize_pdf(self, subfolder_path):
    """Rasterize ``source.pdf`` found in *subfolder_path* into image files.

    The PDF is converted with the configured DPI, image format and thread
    count. The generated images are then handed to ``__rescale``, and
    whatever that returns is returned to the caller.

    Args:
        subfolder_path: Folder that contains ``source.pdf`` and that also
            receives the generated image files.

    Returns:
        The images produced by ``convert_from_path`` after rescaling has
        been applied.
    """
    # Typically, pdf2image will write generated images to a temporary path, after
    # which you can manipulate them. By providing 'output_file' and 'output_folder',
    # we can skip that second step and make pdf2image write directly to our desired
    # output folder, using our desired file name pattern.
    images = convert_from_path(
        os.path.join(subfolder_path, "source.pdf"),
        dpi=self._dpi,
        fmt=self._fmt,
        thread_count=self._thread_count,
        output_file="image-",
        output_folder=subfolder_path,
    )

    return self.__rescale(images)

def __rescale(self, images):
    """Shrink, on disk, every image whose longest edge exceeds the limit.

    Each image whose longest edge is greater than ``self._max_length`` is
    resized with Lanczos resampling, keeping its aspect ratio, and saved
    back over the file named by the image's ``filename`` attribute.

    Args:
        images: Iterable of image objects exposing ``width``, ``height``,
            ``filename`` and ``resize``.

    Returns:
        The *images* argument itself. The objects in it are not replaced;
        only the files on disk are overwritten for images that were resized.
    """
    max_length = self._max_length
    # A falsy or non-positive limit means resizing is disabled. A negative
    # value would otherwise yield a negative scale factor and a failed resize.
    if not max_length or max_length <= 0:
        return images

    for idx, im in enumerate(images):
        w, h = im.width, im.height
        longest = max(w, h)
        if longest == 1:
            logger.warning(f"image index {idx} is 1x1 pixel output")
        if longest <= max_length:
            continue

        # exceeds max_length so reduce. Integer arithmetic keeps the longest
        # edge at exactly max_length (a float scale factor can truncate to one
        # pixel less), and clamping to 1 prevents a zero-sized edge on images
        # with an extreme aspect ratio.
        scale_w = max(1, w * max_length // longest)
        scale_h = max(1, h * max_length // longest)

        logger.info(
            f"resizing image index {idx} from {w},{h} to {scale_w},{scale_h}"
        )
        resized = im.resize((scale_w, scale_h), resample=Image.LANCZOS)
        resized.save(im.filename)

    return images
1 change: 1 addition & 0 deletions src/app/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@
"thread_count": env("PDF_RASTERIZER_THREAD_COUNT", cast=int, default=3),
"format": env("PDF_RASTERIZER_FORMAT", cast=str, default="jpg"),
"dpi": env("PDF_RASTERIZER_DPI", cast=int, default=500),
"max_length": env("PDF_RASTERIZER_MAX_LENGTH", cast=int, default=0),
}

# "chunk_size" comes from the ORIGIN_CHUNK_SIZE setting, cast to int, with a default of 8192.
ORIGIN_CONFIG = {"chunk_size": env("ORIGIN_CHUNK_SIZE", cast=int, default=8192)}
Expand Down

0 comments on commit 010dc0c

Please sign in to comment.