XWIKI-22335: Cannot import a ppt with LibreOffice 24.2.5 (#3517)

* Convert to PDF and extract images from the PDF * Adapt the unit test * Change the output format to PNG * Change the default image format back to JPEG. * Read the old office converter configuration for size and quality. * Add new configuration options for format, quality and image size. * Revert "[Misc] Force using old version of LO until XWIKI-22335 is done" * Revert "[Misc] Revert LO upgrade because of XWIKI-22335" * Fix existing comments in DefaultPresentationBuilder * Provide new properties in xwiki.properties * Fallback in document-formats.js in case custom-document-formats is not used for backward compatibility * Provide unit test * Improve a bit ServletContainerExecutor to have better understanding in case of error when building LO image This reverts commit 29e4543. This reverts commit f19ea82. This reverts commit 32a20ae. --------- Co-authored-by: Simon Urli <[email protected]> (cherry picked from commit acd6ed3)
xwiki · Sep 26, 2024 · 1e71121 · 1e71121
1 parent f5318af
commit 1e71121
Show file tree

Hide file tree

Showing 10 changed files with 347 additions and 66 deletions.
diff --git a/pom.xml b/pom.xml
@@ -77,7 +77,7 @@
          Note: We don't need to exact version (e.g. 7.2.7.2) since the LTS is made available using a max of 2 dots
          (e.g. 7.2.7).
     -->
-    <libreoffice.version>7.6.7</libreoffice.version>
+    <libreoffice.version>24.2.6</libreoffice.version>
 
     <!-- By default check that unit tests don't output anything to the console -->
     <xwiki.surefire.captureconsole.skip>false</xwiki.surefire.captureconsole.skip>

diff --git a/...r/src/main/java/org/xwiki/officeimporter/internal/builder/DefaultPresentationBuilder.java b/...r/src/main/java/org/xwiki/officeimporter/internal/builder/DefaultPresentationBuilder.java
@@ -19,18 +19,20 @@
  */
 package org.xwiki.officeimporter.internal.builder;
 
+import java.awt.image.BufferedImage;
+import java.io.BufferedOutputStream;
 import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringReader;
-import java.io.UnsupportedEncodingException;
 import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 import javax.inject.Inject;
@@ -40,6 +42,10 @@
 
 import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.lang3.tuple.Pair;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.rendering.ImageType;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.tools.imageio.ImageIOUtil;
 import org.w3c.dom.Document;
 import org.xwiki.bridge.DocumentAccessBridge;
 import org.xwiki.component.annotation.Component;
@@ -76,8 +82,6 @@
 @Singleton
 public class DefaultPresentationBuilder implements PresentationBuilder
 {
-    private static final Pattern SLIDE_FORMAT = Pattern.compile("img(?<number>[0-9]+)\\.jpg");
-
     /**
      * Provides the component manager used by {@link XDOMOfficeDocument}.
      */
@@ -117,6 +121,9 @@ public class DefaultPresentationBuilder implements PresentationBuilder
     @Named("xhtml/1.0")
     private Parser xhtmlParser;
 
+    @Inject
+    private PresentationBuilderConfiguration presentationBuilderConfiguration;
+
     @Override
     public XDOMOfficeDocument build(InputStream officeFileStream, String officeFileName,
         DocumentReference documentReference) throws OfficeImporterException
@@ -159,22 +166,17 @@ protected OfficeConverterResult importPresentation(InputStream officeFileStream,
         Map<String, InputStream> inputStreams = Map.of(inputFileName, officeFileStream);
         try {
             // The office converter uses the output file name extension to determine the output format/syntax.
-            // The returned artifacts are of three types: imgX.jpg (slide screen shot), imgX.html (HTML page that
-            // display the corresponding slide screen shot) and textX.html (HTML page that display the text extracted
-            // from the corresponding slide). We use "img0.html" as the output file name because the corresponding
-            // artifact displays a screen shot of the first presentation slide.
-            return this.officeServer.getConverter().convertDocument(inputStreams, inputFileName, "img0.html");
+            // We perform a conversion to PDF to then use a PDF to image conversion.
+            return this.officeServer.getConverter().convertDocument(inputStreams, inputFileName, "presentation.pdf");
         } catch (OfficeConverterException e) {
             String message = "Error while converting document [%s] into html.";
             throw new OfficeImporterException(String.format(message, officeFileName), e);
         }
     }
 
     /**
-     * Builds the presentation HTML from the presentation artifacts. There are two types of presentation artifacts:
-     * slide image and slide text. The returned HTML will display all the slide images. Slide text is currently ignored.
-     * All artifacts except slide images are removed from {@code presentationArtifacts}. Slide images names are prefixed
-     * with the given {@code nameSpace} to avoid name conflicts.
+     * Builds the presentation HTML from the presentation PDF: each PDF page is rendered to an image named
+     * {@code slideX.imageFormatExtension}, where X is the zero-based page index.
      * 
      * @param officeConverterResult the result of the office conversion, whose output file is the presentation PDF;
      *            the generated slide images are written to its output directory
@@ -185,36 +187,54 @@ protected Pair<String, Map<String, OfficeDocumentArtifact>> buildPresentationHTM
         OfficeConverterResult officeConverterResult, String nameSpace) throws IOException
     {
         Map<String, OfficeDocumentArtifact> artifactFiles = new HashMap<>();
-        // Iterate all the slides.
-        Set<File> conversionOutputFiles = officeConverterResult.getAllFiles();
-        Map<Integer, String> filenames = new HashMap<>();
-        for (File conversionOutputFile : conversionOutputFiles) {
-            Matcher matcher = SLIDE_FORMAT.matcher(conversionOutputFile.getName());
-            if (matcher.matches()) {
-                String number = matcher.group("number");
-                String slideImageName = String.format("%s-slide%s.jpg", nameSpace, number);
-                artifactFiles.put(slideImageName, new FileOfficeDocumentArtifact(slideImageName, conversionOutputFile));
-                // Append slide image to the presentation HTML.
-                String slideImageURL = null;
-                try {
-                    // We need to encode the slide image name in case it contains special URL characters.
-                    slideImageURL = URLEncoder.encode(slideImageName, "UTF-8");
-                } catch (UnsupportedEncodingException e) {
-                    // This should never happen.
+        String imageFormat = this.presentationBuilderConfiguration.getImageFormat();
+        float quality = this.presentationBuilderConfiguration.getQuality();
+
+        // We converted the slides to PDF. Now convert each page of the PDF to an image.
+        List<String> filenames = new ArrayList<>();
+        try (PDDocument document = PDDocument.load(officeConverterResult.getOutputFile())) {
+            PDFRenderer pdfRenderer = new PDFRenderer(document);
+            int numberOfPages = document.getPages().getCount();
+            for (int pageCounter = 0; pageCounter < numberOfPages; ++pageCounter) {
+                // note that the page number parameter is zero based
+                // Compute the DPI based on the slide width.
+                int outputWidth = this.presentationBuilderConfiguration.getSlideWidth();
+                // Get the width of the slide in points.
+                float pageWidth = document.getPage(pageCounter).getMediaBox().getWidth();
+                // The page width is in points (72 per inch), so scale the target pixel width accordingly.
+                float dpi = outputWidth / pageWidth * 72;
+
+                BufferedImage bim = pdfRenderer.renderImageWithDPI(pageCounter, dpi, ImageType.RGB);
+
+                String slideFileName = String.format("slide%s.%s", pageCounter, imageFormat);
+
+                // Store the image in the output directory as this will be cleaned up automatically at the end.
+                File imageFile = new File(officeConverterResult.getOutputDirectory(), slideFileName);
+                try (BufferedOutputStream outputStream = new BufferedOutputStream(new FileOutputStream(imageFile))) {
+                    ImageIOUtil.writeImage(bim, imageFormat, outputStream, (int) dpi, quality);
                 }
+
+                String slideImageName = String.format("%s-slide%s.%s", nameSpace, pageCounter, imageFormat);
+                artifactFiles.put(slideImageName, new FileOfficeDocumentArtifact(slideImageName, imageFile));
+                // suffix in filename will be used as the file format
+
+                // Append slide image to the presentation HTML.
+                // We need to encode the slide image name in case it contains special URL characters.
+                String slideImageURL = URLEncoder.encode(slideImageName, StandardCharsets.UTF_8);
                 // We do not want to encode the spaces in '+' since '+' will be then reencoded in
                 // ImageFilter to keep it and not consider it as a space when decoding it.
                 // This is due to a bug in LibreOffice that does not convert the '+' properly, so we cannot distinguish
                 // them from spaces in filenames. This should be removed once
                 // https://github.com/sbraconnier/jodconverter/issues/125 is fixed.
                 slideImageURL = slideImageURL.replace('+', ' ');
 
-                filenames.put(Integer.parseInt(number), slideImageURL);
+                filenames.add(slideImageURL);
             }
         }
+
         // The slides were added in page order, so the filenames are already ordered by slide number.
-        String presentationHTML = filenames.entrySet().stream().sorted(Map.Entry.comparingByKey())
-            .map(entry -> String.format("<p><img src=\"%s\"/></p>", XMLUtils.escapeAttributeValue(entry.getValue())))
+        String presentationHTML = filenames.stream()
+            .map(entry -> String.format("<p><img src=\"%s\"/></p>", XMLUtils.escapeAttributeValue(entry)))
             .collect(Collectors.joining());
         return Pair.of(presentationHTML, artifactFiles);
     }

diff --git a/...main/java/org/xwiki/officeimporter/internal/builder/PresentationBuilderConfiguration.java b/...main/java/org/xwiki/officeimporter/internal/builder/PresentationBuilderConfiguration.java
@@ -0,0 +1,141 @@
+/*
+ * See the NOTICE file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * This is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this software; if not, write to the Free
+ * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
+ */
+package org.xwiki.officeimporter.internal.builder;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Optional;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import javax.inject.Inject;
+import javax.inject.Named;
+import javax.inject.Singleton;
+
+import org.slf4j.Logger;
+import org.xwiki.component.annotation.Component;
+import org.xwiki.component.phase.Initializable;
+import org.xwiki.component.phase.InitializationException;
+import org.xwiki.configuration.ConfigurationSource;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+/**
+ * Configuration for the presentation builder: the width, quality and format of the images generated for slides.
+ * <p>
+ * At initialization, the width and quality are read from the {@code FilterData} entry of the {@code PRESENTATION}
+ * store properties of the {@code HTML} format declared in the document format registry resources
+ * ({@code /custom-document-formats.json}, falling back to {@code /document-formats.js}), when such an entry exists.
+ * These values are then used as default values when reading the {@code officeimporter.presentation.*} properties
+ * from the {@code xwikiproperties} configuration source.
+ *
+ * @version $Id$
+ * @since 16.8.0
+ * @since 16.4.4
+ * @since 15.10.13
+ */
+@Component(roles = PresentationBuilderConfiguration.class)
+@Singleton
+public class PresentationBuilderConfiguration implements Initializable
+{
+    private static final String DEFAULT_IMAGE_FORMAT = "jpg";
+
+    private int slideWidth = 1920;
+
+    private float quality = 95;
+
+    @Inject
+    @Named("xwikiproperties")
+    private ConfigurationSource configurationSource;
+
+    @Inject
+    private Logger logger;
+
+    @Override
+    public void initialize() throws InitializationException
+    {
+        try {
+            // For backward compatibility reasons we check both custom-document-formats and document-formats.
+            if (!extractConfigurationFromJsonRegistry("/custom-document-formats.json")) {
+                extractConfigurationFromJsonRegistry("/document-formats.js");
+            }
+        } catch (Exception e) {
+            this.logger.error("Error when initializing values from document format registry, "
+                + "default values will be used.", e);
+        }
+    }
+
+    private boolean extractConfigurationFromJsonRegistry(String filename) throws IOException
+    {
+        AtomicBoolean result = new AtomicBoolean(false);
+        try (InputStream configurationInput = getClass().getResourceAsStream(filename)) {
+            if (configurationInput != null) {
+                // The resource exists: parse it as JSON (a missing resource simply yields false).
+                ObjectMapper objectMapper = new ObjectMapper();
+                // Read the JSON which should be an array of JSON objects.
+                // Each JSON object should have the following properties: name (string), storeProperties (object with
+                // key "PRESENTATION" that again has a key FilterData with keys Quality and Width that are integers).
+                JsonNode jsonNode = objectMapper.readTree(configurationInput);
+                for (JsonNode formatNode : jsonNode) {
+                    getJsonNode(formatNode, "name")
+                        .filter(nameNode -> "HTML".equals(nameNode.asText()))
+                        .flatMap(nameNode -> getJsonNode(formatNode, "storeProperties"))
+                        .flatMap(storeProperties -> getJsonNode(storeProperties, "PRESENTATION"))
+                        .flatMap(presentationProperties -> getJsonNode(presentationProperties, "FilterData"))
+                        .ifPresent(filterData -> {
+                            result.set(true);
+                            getJsonNode(filterData, "Quality")
+                                .ifPresent(qualityNode -> this.quality = qualityNode.asInt());
+                            getJsonNode(filterData, "Width")
+                                .ifPresent(widthNode -> this.slideWidth = widthNode.asInt());
+                        });
+                }
+            }
+        }
+        return result.get();
+    }
+
+    private Optional<JsonNode> getJsonNode(JsonNode jsonNode, String fieldName)
+    {
+        return Optional.ofNullable(jsonNode.get(fieldName));
+    }
+
+    /**
+     * @return the width of the images to generate for slides in pixels, read from the
+     *     {@code officeimporter.presentation.slideWidth} property; the default value passed for that property is
+     *     the width found in the document format registry during initialization, or 1920 if none was found
+     */
+    public int getSlideWidth()
+    {
+        return this.configurationSource.getProperty("officeimporter.presentation.slideWidth", this.slideWidth);
+    }
+
+    /**
+     * @return the image quality to use when converting slides to images: {@code 0} when the configured image format
+     *     is exactly {@code png} (the comparison is case-sensitive), otherwise the value of the
+     *     {@code officeimporter.presentation.quality} property divided by 100; the default value passed for that
+     *     property is the quality found in the document format registry during initialization, or 95 if none was
+     *     found. NOTE(review): the 0 for PNG presumably selects the strongest PNG compression, to be confirmed
+     *     against the image writer used by the caller.
+     */
+    public float getQuality()
+    {
+        if ("png".equals(this.getImageFormat())) {
+            return 0f;
+        }
+
+        return this.configurationSource.getProperty("officeimporter.presentation.quality", this.quality) / 100f;
+    }
+
+    /**
+     * @return the image format to use when converting slides to images, read from the
+     *     {@code officeimporter.presentation.imageFormat} property with {@code jpg} passed as the default value
+     */
+    public String getImageFormat()
+    {
+        return this.configurationSource.getProperty("officeimporter.presentation.imageFormat", DEFAULT_IMAGE_FORMAT);
+    }
+}
diff --git a/...platform-office/xwiki-platform-office-importer/src/main/resources/META-INF/components.txt b/...platform-office/xwiki-platform-office-importer/src/main/resources/META-INF/components.txt
@@ -2,6 +2,7 @@ org.xwiki.officeimporter.internal.ModelBridge
 org.xwiki.officeimporter.internal.builder.DefaultXHTMLOfficeDocumentBuilder
 org.xwiki.officeimporter.internal.builder.DefaultXDOMOfficeDocumentBuilder
 org.xwiki.officeimporter.internal.builder.DefaultPresentationBuilder
+org.xwiki.officeimporter.internal.builder.PresentationBuilderConfiguration
 org.xwiki.officeimporter.internal.cleaner.OfficeHTMLCleaner
 org.xwiki.officeimporter.internal.cleaner.WysiwygHTMLCleaner
 org.xwiki.officeimporter.internal.filter.AnchorFilter