Removal of Ghostscript to use qpdf and tesseract directly (#2338)

* navbar fix multi tool and compress location * release notes and ghostscript removal * cleanups * formatting * update docs * more * more * docs * release bump * Hardening suggestions for Stirling-PDF / ghostscript (#2339) * Protect `readLine()` against DoS * Sanitized user-provided file names in HTTP multipart uploads --------- Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com> --------- Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com>
Stirling-Tools · Nov 26, 2024 · 833b3c4 · 833b3c4
1 parent 654bc94
commit 833b3c4
Show file tree

Hide file tree

Showing 69 changed files with 1,112 additions and 671 deletions.
diff --git a/DeveloperGuide.md b/DeveloperGuide.md
@@ -11,7 +11,7 @@ Stirling-PDF is built using:
 - Spring Boot + Thymeleaf
 - PDFBox
 - LibreOffice
-- OcrMyPdf
+- qpdf
 - HTML, CSS, JavaScript
 - Docker
 - PDF.js
@@ -243,7 +243,7 @@ To run Stirling-PDF locally:
 
 Important notes:
 
-- Local testing doesn't include features that depend on external tools like OCRmyPDF, LibreOffice, or Python scripts.
+- Local testing doesn't include features that depend on external tools like qpdf, LibreOffice, or Python scripts.
 - There are currently no automated unit tests. All testing is done manually through the UI or API calls. (You are welcome to add JUnits!)
 - Always verify your changes in the full Docker environment before submitting pull requests, as some integrations and features will only work in the complete setup.
 

diff --git a/Dockerfile b/Dockerfile
@@ -30,6 +30,7 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et
         tini \
         bash \
         curl \
+        qpdf \
         shadow \
         su-exec \
         openssl \
@@ -40,7 +41,6 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et
 # pdftohtml
         poppler-utils \
 # OCR MY PDF (unpaper for deskew and other advanced features)
-        ocrmypdf \
         tesseract-ocr-data-eng \
 # CV
         py3-opencv \

diff --git a/Dockerfile-fat b/Dockerfile-fat
@@ -55,7 +55,7 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et
 # pdftohtml
         poppler-utils \
 # OCR MY PDF (unpaper for deskew and other advanced features)
-        ocrmypdf \
+        qpdf \
         tesseract-ocr-data-eng \
         font-terminus font-dejavu font-noto font-noto-cjk font-awesome font-noto-extra \
 # CV

diff --git a/Endpoint-groups.md b/Endpoint-groups.md
@@ -1,4 +1,4 @@
-| Operation           | PageOps | Convert | Security | Other | CLI | Python | OpenCV | LibreOffice | OCRmyPDF | Java | Javascript | Unoconv | Ghostscript |
+| Operation           | PageOps | Convert | Security | Other | CLI | Python | OpenCV | LibreOffice | qpdf | Java | Javascript | Unoconv | tesseract |
 | ------------------- | ------- | ------- | -------- | ----- | --- | ------ | ------ | ----------- | -------- | ---- | ---------- | ------- | ----------- |
 | adjust-contrast     | ✔️       |         |          |       |     |        |        |             |          |      | ✔️          |         |             |
 | auto-split-pdf      | ✔️       |         |          |       |     |        |        |             |          | ✔️    |            |         |             |
@@ -16,7 +16,7 @@
 | img-to-pdf          |         | ✔️       |          |       |     |        |        |             |          | ✔️    |            |         |             |
 | pdf-to-html         |         | ✔️       |          |       | ✔️   |        |        | ✔️           |          |      |            |         |             |
 | pdf-to-img          |         | ✔️       |          |       |     | ✔️      |        |             |          | ✔️    |            |         |             |
-| pdf-to-pdfa         |         | ✔️       |          |       | ✔️   |        |        |             | ✔️        |      |            |         | ✔️           |
+| pdf-to-pdfa         |         | ✔️       |          |       | ✔️   |        |        |             | ✔️        |      |            |         |            |
 | pdf-to-markdown     |         | ✔️       |          |       |     |        |        |             |          | ✔️    |            |         |             |
 | pdf-to-presentation |         | ✔️       |          |       | ✔️   |        |        | ✔️           |          |      |            |         |             |
 | pdf-to-text         |         | ✔️       |          |       | ✔️   |        |        | ✔️           |          |      |            |         |             |
@@ -34,13 +34,13 @@
 | auto-rename         |         |         |          | ✔️     |     |        |        |             |          | ✔️    |            |         |             |
 | change-metadata     |         |         |          | ✔️     |     |        |        |             |          | ✔️    |            |         |             |
 | compare             |         |         |          | ✔️     |     |        |        |             |          |      | ✔️          |         |             |
-| compress-pdf        |         |         |          | ✔️     | ✔️   |        |        |             | ✔️        |      |            |         | ✔️           |
+| compress-pdf        |         |         |          | ✔️     | ✔️   |        |        |             | ✔️        |      |            |         |            |
 | extract-image-scans |         |         |          | ✔️     | ✔️   | ✔️      | ✔️      |             |          |      |            |         |             |
 | extract-images      |         |         |          | ✔️     |     |        |        |             |          | ✔️    |            |         |             |
 | flatten             |         |         |          | ✔️     |     |        |        |             |          |      | ✔️          |         |             |
 | get-info-on-pdf     |         |         |          | ✔️     |     |        |        |             |          | ✔️    |            |         |             |
-| ocr-pdf             |         |         |          | ✔️     | ✔️   |        |        |             | ✔️        |      |            |         |             |
+| ocr-pdf             |         |         |          | ✔️     | ✔️   |        |        |             |          |      |            |         | ✔️           |
 | remove-blanks       |         |         |          | ✔️     | ✔️   | ✔️      | ✔️      |             |          |      |            |         |             |
-| repair              |         |         |          | ✔️     | ✔️   |        |        | ✔️           |          |      |            |         | ✔️           |
+| repair              |         |         |          | ✔️     | ✔️   |        |        | ✔️           | ✔️        |      |            |         |             |
 | show-javascript     |         |         |          | ✔️     |     |        |        |             |          |      | ✔️          |         |             |
 | sign                |         |         |          | ✔️     |     |        |        |             |          |      | ✔️          |         |             |
diff --git a/HowToUseOCR.md b/HowToUseOCR.md
@@ -8,7 +8,7 @@ The paths have changed for the tessdata locations on new Docker images. Please u
 
 ## How does the OCR Work
 
-Stirling-PDF uses [OCRmyPDF](https://github.com/ocrmypdf/OCRmyPDF), which in turn uses Tesseract for its text recognition. All credit goes to them for this awesome work!
+Stirling-PDF uses Tesseract for its text recognition. All credit goes to them for this awesome work!
 
 ## Language Packs
 
@@ -52,8 +52,6 @@ Add the following to your existing Docker run command:
 
 ### Non-Docker Setup
 
-If you are not using Docker, you need to install the OCR components, including the `ocrmypdf` app. You can see the [OCRmyPDF install guide](https://ocrmypdf.readthedocs.io/en/latest/installation.html).
-
 For Debian-based systems, install languages with this command:
 
 ```bash
@@ -83,8 +81,7 @@ rpm -qa | grep tesseract-langpack | sed 's/tesseract-langpack-//g'
 
 For Windows:
 
-Ensure ocrmypdf in installed with
-``pip install ocrmypdf``
+You must ensure Tesseract is installed.
 
 Additional languages must be downloaded manually:
 Download desired .traineddata files from tessdata or tessdata_fast

diff --git a/LocalRunGuide.md b/LocalRunGuide.md
@@ -68,7 +68,7 @@ nix-env -iA nixpkgs.jbig2enc
 
 ### Step 3: Install Additional Software
 
-Next we need to install LibreOffice for conversions, ocrmypdf for OCR, and OpenCV for pattern recognition functionality.
+Next we need to install LibreOffice for conversions, qpdf for compression and repair, Tesseract for OCR, and OpenCV for pattern recognition functionality.
 
 Install the following software:
 
@@ -81,27 +81,27 @@ Install the following software:
 - unoconv
 - pngquant
 - unpaper
-- ocrmypdf
+- qpdf
 - opencv-python-headless
 
 For Debian-based systems, you can use the following command:
 
 ```bash
-sudo apt-get install -y libreoffice-writer libreoffice-calc libreoffice-impress unpaper ocrmypdf
+sudo apt-get install -y libreoffice-writer libreoffice-calc libreoffice-impress unpaper qpdf
 pip3 install uno opencv-python-headless unoconv pngquant WeasyPrint --break-system-packages
 ```
 
 For Fedora:
 
 ```bash
-sudo dnf install -y libreoffice-writer libreoffice-calc libreoffice-impress unpaper ocrmypdf
+sudo dnf install -y libreoffice-writer libreoffice-calc libreoffice-impress unpaper qpdf
 pip3 install uno opencv-python-headless unoconv pngquant WeasyPrint
 ```
 
 For Nix:
 
 ```bash
-nix-env -iA nixpkgs.unpaper nixpkgs.libreoffice nixpkgs.ocrmypdf nixpkgs.poppler_utils
+nix-env -iA nixpkgs.unpaper nixpkgs.libreoffice nixpkgs.qpdf nixpkgs.poppler_utils
 pip3 install uno opencv-python-headless unoconv pngquant WeasyPrint
 ```
 
@@ -146,7 +146,6 @@ The easiest method is to use the language packs provided by your repositories. S
 
 1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need.
 2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tessdata`
-3. Please view [OCRmyPDF install guide](https://ocrmypdf.readthedocs.io/en/latest/installation.html) for more info.
 
 **IMPORTANT:** DO NOT REMOVE EXISTING `eng.traineddata`, IT'S REQUIRED.
 

diff --git a/README.md b/README.md
@@ -79,15 +79,15 @@ All files and PDFs exist either exclusively on the client side, reside in server
 - Detect and remove blank pages
 - Compare two PDFs and show differences in text
 - Add images to PDFs
-- Compress PDFs to decrease their filesize (using OCRMyPDF)
+- Compress PDFs to decrease their filesize (using qpdf)
 - Extract images from PDF
 - Remove images from PDF
 - Extract images from scans
 - Remove annotations
 - Add page numbers
 - Auto rename file by detecting PDF header text
-- OCR on PDF (using OCRMyPDF)
-- PDF/A conversion (using OCRMyPDF)
+- OCR on PDF (using Tesseract)
+- PDF/A conversion (using LibreOffice)
 - Edit metadata
 - Flatten PDFs
 - Get all information on a PDF to view or export as JSON
@@ -102,7 +102,7 @@ A demo of the app is available [here](https://stirlingpdf.io).
 - Spring Boot + Thymeleaf
 - [PDFBox](https://github.com/apache/pdfbox/tree/trunk)
 - [LibreOffice](https://www.libreoffice.org/discover/libreoffice/) for advanced conversions
-- [OcrMyPdf](https://github.com/ocrmypdf/OCRmyPDF)
+- [qpdf](https://github.com/qpdf/qpdf)
 - HTML, CSS, JavaScript
 - Docker
 - [PDF.js](https://github.com/mozilla/pdf.js)

diff --git a/Version-groups.md b/Version-groups.md
@@ -8,7 +8,7 @@ The 'Fat' container contains all those found in 'Full' with security jar along w
 | Libre      |            |   ✔️   |
 | Python     |            |   ✔️   |
 | OpenCV     |            |   ✔️   |
-| OCRmyPDF   |            |   ✔️   |
+| qpdf   |            |   ✔️   |
 
 | Operation              | Ultra-Lite | Full |
 | ---------------------- | ---------- | ---- |

diff --git a/build.gradle b/build.gradle
@@ -24,7 +24,8 @@ ext {
 }
 
 group = "stirling.software"
-version = "0.34.0"
+version = "0.35.0"
+
 
 java {
     // 17 is lowest but we support and recommend 21

diff --git a/cucumber/features/external.feature b/cucumber/features/external.feature
@@ -145,7 +145,7 @@ Feature: API Validation
     And the response file should have extension ".pdf"
     And the response file should have size greater than 100
 
-  @compress @ghostscript @positive
+  @compress @qpdf @positive
   Scenario: Compress
     Given I use an example file at "exampleFiles/ghost3.pdf" as parameter "fileInput"
 	And the request data includes
@@ -156,7 +156,7 @@ Feature: API Validation
     And the response file should have extension ".pdf"
     And the response file should have size greater than 100
 
-  @compress @ghostscript @positive
+  @compress @qpdf @positive
   Scenario: Compress
     Given I use an example file at "exampleFiles/ghost2.pdf" as parameter "fileInput"
 	And the request data includes
@@ -169,7 +169,7 @@ Feature: API Validation
     And the response file should have size greater than 100
 
 
-  @compress @ghostscript @positive
+  @compress @qpdf @positive
   Scenario: Compress
     Given I use an example file at "exampleFiles/ghost1.pdf" as parameter "fileInput"
 	And the request data includes

diff --git a/scripts/replace_translation_line.sh b/scripts/replace_translation_line.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
 
 translation_key="pdfToPDFA.credit"
-old_value="OCRmyPDF"
-new_value="ghostscript"
+old_value="qpdf"
+new_value="libreoffice"
 
 for file in ../src/main/resources/messages_*.properties; do
   sed -i "/^$translation_key=/s/$old_value/$new_value/" "$file"

diff --git a/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java b/src/main/java/stirling/software/SPDF/config/EndpointConfiguration.java
@@ -188,7 +188,7 @@ public void init() {
         addEndpointToGroup("OpenCV", "extract-image-scans");
 
         // LibreOffice
-        addEndpointToGroup("LibreOffice", "repair");
+        addEndpointToGroup("qpdf", "repair");
         addEndpointToGroup("LibreOffice", "file-to-pdf");
         addEndpointToGroup("LibreOffice", "pdf-to-word");
         addEndpointToGroup("LibreOffice", "pdf-to-presentation");
@@ -199,10 +199,11 @@ public void init() {
         // Unoconv
         addEndpointToGroup("Unoconv", "file-to-pdf");
 
-        // OCRmyPDF
-        addEndpointToGroup("OCRmyPDF", "compress-pdf");
-        addEndpointToGroup("OCRmyPDF", "pdf-to-pdfa");
-        addEndpointToGroup("OCRmyPDF", "ocr-pdf");
+        // qpdf
+        addEndpointToGroup("qpdf", "compress-pdf");
+        addEndpointToGroup("qpdf", "pdf-to-pdfa");
+
+        addEndpointToGroup("tesseract", "ocr-pdf");
 
         // Java
         addEndpointToGroup("Java", "merge-pdfs");
@@ -248,10 +249,10 @@ public void init() {
         addEndpointToGroup("Javascript", "compare");
         addEndpointToGroup("Javascript", "adjust-contrast");
 
-        // Ghostscript dependent endpoints
-        addEndpointToGroup("Ghostscript", "compress-pdf");
-        addEndpointToGroup("Ghostscript", "pdf-to-pdfa");
-        addEndpointToGroup("Ghostscript", "repair");
+        // qpdf dependent endpoints
+        addEndpointToGroup("qpdf", "compress-pdf");
+        addEndpointToGroup("qpdf", "pdf-to-pdfa");
+        addEndpointToGroup("qpdf", "repair");
 
         // Weasyprint dependent endpoints
         addEndpointToGroup("Weasyprint", "html-to-pdf");

diff --git a/src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java b/src/main/java/stirling/software/SPDF/config/ExternalAppDepConfig.java
@@ -37,12 +37,13 @@ private boolean isCommandAvailable(String command) {
     private final Map<String, List<String>> commandToGroupMapping =
             new HashMap<>() {
                 {
-                    put("gs", List.of("Ghostscript"));
                     put("soffice", List.of("LibreOffice"));
-                    put("ocrmypdf", List.of("OCRmyPDF"));
                     put("weasyprint", List.of("Weasyprint"));
                     put("pdftohtml", List.of("Pdftohtml"));
                     put("unoconv", List.of("Unoconv"));
+                    put("qpdf", List.of("qpdf"));
+                    put("tesseract", List.of("tesseract"));
+
                 }
             };
 
@@ -97,9 +98,9 @@ private void checkDependencyAndDisableGroup(String command) {
     public void checkDependencies() {
 
         // Check core dependencies
-        checkDependencyAndDisableGroup("gs");
+    	checkDependencyAndDisableGroup("tesseract");
         checkDependencyAndDisableGroup("soffice");
-        checkDependencyAndDisableGroup("ocrmypdf");
+        checkDependencyAndDisableGroup("qpdf");
         checkDependencyAndDisableGroup("weasyprint");
         checkDependencyAndDisableGroup("pdftohtml");
         checkDependencyAndDisableGroup("unoconv");