Skip to content

Commit

Permalink
XWIKI-22335: Cannot import a ppt with LibreOffice 24.2.5 (#3517)
Browse files Browse the repository at this point in the history
* Convert to PDF and extract images from the PDF
* Adapt the unit test
* Change the output format to PNG
* Change the default image format back to JPEG.
* Read the old office converter configuration for size and quality.
* Add new configuration options for format, quality and image size.
* Revert "[Misc] Force using old version of LO until XWIKI-22335 is done"
* Revert "[Misc] Revert LO upgrade because of XWIKI-22335"
* Fix existing comments in DefaultPresentationBuilder
* Provide new properties in xwiki.properties
* Fallback in document-formats.js in case custom-document-formats is
    not used for backward compatibility
* Provide unit test
* Improve a bit ServletContainerExecutor to have better understanding
    in case of error when building LO image

This reverts commit 29e4543.
This reverts commit f19ea82.
This reverts commit 32a20ae.

---------

Co-authored-by: Simon Urli <[email protected]>
(cherry picked from commit acd6ed3)
  • Loading branch information
michitux authored and surli committed Sep 26, 2024
1 parent f5318af commit 1e71121
Show file tree
Hide file tree
Showing 10 changed files with 347 additions and 66 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@
Note: We don't need to exact version (e.g. 7.2.7.2) since the LTS is made available using a max of 2 dots
(e.g. 7.2.7).
-->
<libreoffice.version>7.6.7</libreoffice.version>
<libreoffice.version>24.2.6</libreoffice.version>

<!-- By default check that unit tests don't output anything to the console -->
<xwiki.surefire.captureconsole.skip>false</xwiki.surefire.captureconsole.skip>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,20 @@
*/
package org.xwiki.officeimporter.internal.builder;

import java.awt.image.BufferedImage;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import javax.inject.Inject;
Expand All @@ -40,6 +42,10 @@

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.w3c.dom.Document;
import org.xwiki.bridge.DocumentAccessBridge;
import org.xwiki.component.annotation.Component;
Expand Down Expand Up @@ -76,8 +82,6 @@
@Singleton
public class DefaultPresentationBuilder implements PresentationBuilder
{
private static final Pattern SLIDE_FORMAT = Pattern.compile("img(?<number>[0-9]+)\\.jpg");

/**
* Provides the component manager used by {@link XDOMOfficeDocument}.
*/
Expand Down Expand Up @@ -117,6 +121,9 @@ public class DefaultPresentationBuilder implements PresentationBuilder
@Named("xhtml/1.0")
private Parser xhtmlParser;

@Inject
private PresentationBuilderConfiguration presentationBuilderConfiguration;

@Override
public XDOMOfficeDocument build(InputStream officeFileStream, String officeFileName,
DocumentReference documentReference) throws OfficeImporterException
Expand Down Expand Up @@ -159,22 +166,17 @@ protected OfficeConverterResult importPresentation(InputStream officeFileStream,
Map<String, InputStream> inputStreams = Map.of(inputFileName, officeFileStream);
try {
// The office converter uses the output file name extension to determine the output format/syntax.
// The returned artifacts are of three types: imgX.jpg (slide screen shot), imgX.html (HTML page that
// display the corresponding slide screen shot) and textX.html (HTML page that display the text extracted
// from the corresponding slide). We use "img0.html" as the output file name because the corresponding
// artifact displays a screen shot of the first presentation slide.
return this.officeServer.getConverter().convertDocument(inputStreams, inputFileName, "img0.html");
// We perform a conversion to PDF to then use a PDF to image conversion.
return this.officeServer.getConverter().convertDocument(inputStreams, inputFileName, "presentation.pdf");
} catch (OfficeConverterException e) {
String message = "Error while converting document [%s] into html.";
throw new OfficeImporterException(String.format(message, officeFileName), e);
}
}

/**
* Builds the presentation HTML from the presentation artifacts. There are two types of presentation artifacts:
* slide image and slide text. The returned HTML will display all the slide images. Slide text is currently ignored.
* All artifacts except slide images are removed from {@code presentationArtifacts}. Slide images names are prefixed
* with the given {@code nameSpace} to avoid name conflicts.
* Builds the presentation HTML from the presentation PDF: each PDF page is converted to an image named
* {@code slideX.imageFormatExtension}, where X is the zero-based slide number.
*
* @param officeConverterResult the map of presentation artifacts; this method removes some of the presentation
* artifacts and renames others so be aware of the side effects
Expand All @@ -185,36 +187,54 @@ protected Pair<String, Map<String, OfficeDocumentArtifact>> buildPresentationHTM
OfficeConverterResult officeConverterResult, String nameSpace) throws IOException
{
Map<String, OfficeDocumentArtifact> artifactFiles = new HashMap<>();
// Iterate all the slides.
Set<File> conversionOutputFiles = officeConverterResult.getAllFiles();
Map<Integer, String> filenames = new HashMap<>();
for (File conversionOutputFile : conversionOutputFiles) {
Matcher matcher = SLIDE_FORMAT.matcher(conversionOutputFile.getName());
if (matcher.matches()) {
String number = matcher.group("number");
String slideImageName = String.format("%s-slide%s.jpg", nameSpace, number);
artifactFiles.put(slideImageName, new FileOfficeDocumentArtifact(slideImageName, conversionOutputFile));
// Append slide image to the presentation HTML.
String slideImageURL = null;
try {
// We need to encode the slide image name in case it contains special URL characters.
slideImageURL = URLEncoder.encode(slideImageName, "UTF-8");
} catch (UnsupportedEncodingException e) {
// This should never happen.
String imageFormat = this.presentationBuilderConfiguration.getImageFormat();
float quality = this.presentationBuilderConfiguration.getQuality();

// We converted the slides to PDF. Now convert each page of the PDF to an image.
List<String> filenames = new ArrayList<>();
try (PDDocument document = PDDocument.load(officeConverterResult.getOutputFile())) {
PDFRenderer pdfRenderer = new PDFRenderer(document);
int numberOfPages = document.getPages().getCount();
for (int pageCounter = 0; pageCounter < numberOfPages; ++pageCounter) {
// Note that PDF page indexes are zero-based.
// Target width, in pixels, of the image generated for the slide.
int outputWidth = this.presentationBuilderConfiguration.getSlideWidth();
// Get the width of the slide in points.
float pageWidth = document.getPage(pageCounter).getMediaBox().getWidth();
// Compute the DPI based on the slide width.
float dpi = outputWidth / pageWidth * 72;

BufferedImage bim = pdfRenderer.renderImageWithDPI(pageCounter, dpi, ImageType.RGB);

String slideFileName = String.format("slide%s.%s", pageCounter, imageFormat);

// Store the image in the output directory as this will be cleaned up automatically at the end.
File imageFile = new File(officeConverterResult.getOutputDirectory(), slideFileName);
try (BufferedOutputStream outputStream = new BufferedOutputStream(new FileOutputStream(imageFile))) {
ImageIOUtil.writeImage(bim, imageFormat, outputStream, (int) dpi, quality);
}

String slideImageName = String.format("%s-slide%s.%s", nameSpace, pageCounter, imageFormat);
artifactFiles.put(slideImageName, new FileOfficeDocumentArtifact(slideImageName, imageFile));
// suffix in filename will be used as the file format

// Append slide image to the presentation HTML.
// We need to encode the slide image name in case it contains special URL characters.
String slideImageURL = URLEncoder.encode(slideImageName, StandardCharsets.UTF_8);
// We do not want to encode the spaces as '+' since '+' would then be re-encoded in
// ImageFilter to keep it and not consider it as a space when decoding it.
// This is linked to a bug in LibreOffice that does not properly convert the '+', so we cannot distinguish
// them from spaces in filenames. This should be removed once
// https://github.com/sbraconnier/jodconverter/issues/125 is fixed.
slideImageURL = slideImageURL.replace('+', ' ');

filenames.put(Integer.parseInt(number), slideImageURL);
filenames.add(slideImageURL);
}
}

// We sort by number so that the filenames are ordered by slide number.
String presentationHTML = filenames.entrySet().stream().sorted(Map.Entry.comparingByKey())
.map(entry -> String.format("<p><img src=\"%s\"/></p>", XMLUtils.escapeAttributeValue(entry.getValue())))
String presentationHTML = filenames.stream()
.map(entry -> String.format("<p><img src=\"%s\"/></p>", XMLUtils.escapeAttributeValue(entry)))
.collect(Collectors.joining());
return Pair.of(presentationHTML, artifactFiles);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
/*
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/
package org.xwiki.officeimporter.internal.builder;

import java.io.IOException;
import java.io.InputStream;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicBoolean;

import javax.inject.Inject;
import javax.inject.Named;
import javax.inject.Singleton;

import org.slf4j.Logger;
import org.xwiki.component.annotation.Component;
import org.xwiki.component.phase.Initializable;
import org.xwiki.component.phase.InitializationException;
import org.xwiki.configuration.ConfigurationSource;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

/**
 * Configuration for the presentation builder.
 * <p>
 * Default values (slide width and image quality) are first read, at initialization time, from the office document
 * format registry found on the classpath ({@code /custom-document-formats.json}, falling back to
 * {@code /document-formats.js}). Each getter then lets the {@code xwikiproperties} configuration source override the
 * default through the {@code officeimporter.presentation.*} properties. Invalid values (non-positive width, quality
 * outside of the 0-100 range) are never returned to the caller.
 *
 * @version $Id$
 * @since 16.8.0
 * @since 16.4.4
 * @since 15.10.13
 */
@Component(roles = PresentationBuilderConfiguration.class)
@Singleton
public class PresentationBuilderConfiguration implements Initializable
{
    private static final String DEFAULT_IMAGE_FORMAT = "jpg";

    private static final String PNG_IMAGE_FORMAT = "png";

    private static final String QUALITY_KEY = "Quality";

    private static final String WIDTH_KEY = "Width";

    /**
     * Upper bound of the quality when expressed as a percentage.
     */
    private static final float MAX_QUALITY = 100f;

    /**
     * Marker returned by {@link JsonNode#asInt(int)} when the JSON value cannot be converted to an integer.
     */
    private static final int INVALID_VALUE = -1;

    /**
     * Default slide width in pixels, possibly overwritten by the document format registry.
     */
    private int slideWidth = 1920;

    /**
     * Default image quality as a percentage (0-100), possibly overwritten by the document format registry.
     */
    private float quality = 95;

    @Inject
    @Named("xwikiproperties")
    private ConfigurationSource configurationSource;

    @Inject
    private Logger logger;

    @Override
    public void initialize() throws InitializationException
    {
        try {
            // For backward compatibility reason we check both custom-document-formats and document-formats.
            if (!extractConfigurationFromJsonRegistry("/custom-document-formats.json")) {
                extractConfigurationFromJsonRegistry("/document-formats.js");
            }
        } catch (Exception e) {
            // Best effort: a broken registry must not prevent the component from being used.
            this.logger.error("Error when initializing values from document format registry, "
                + "default values will be used.", e);
        }
    }

    /**
     * Reads the quality and width defaults from the given JSON document format registry.
     *
     * @param filename the classpath location of the registry to read
     * @return {@code true} if the registry exists and contains the {@code FilterData} of the presentation store
     *     properties of the HTML format, {@code false} otherwise
     * @throws IOException if the registry cannot be read or parsed
     */
    private boolean extractConfigurationFromJsonRegistry(String filename) throws IOException
    {
        AtomicBoolean result = new AtomicBoolean(false);
        try (InputStream configurationInput = getClass().getResourceAsStream(filename)) {
            if (configurationInput == null) {
                return false;
            }

            // Load the configuration from the JSON file
            ObjectMapper objectMapper = new ObjectMapper();
            // Read the JSON which should be an array of JSON objects
            // Each JSON object should have the following properties: name (string), storeProperties (object with
            // key "PRESENTATION" that again has a key FilterData with keys Quality and Width that are integers).
            JsonNode jsonNode = objectMapper.readTree(configurationInput);
            if (jsonNode == null) {
                // Nothing could be parsed (e.g. empty content): behave as if the registry was missing.
                return false;
            }

            for (JsonNode formatNode : jsonNode) {
                getJsonNode(formatNode, "name")
                    .filter(nameNode -> "HTML".equals(nameNode.asText()))
                    .flatMap(nameNode -> getJsonNode(formatNode, "storeProperties"))
                    .flatMap(storeProperties -> getJsonNode(storeProperties, "PRESENTATION"))
                    .flatMap(presentationProperties -> getJsonNode(presentationProperties, "FilterData"))
                    .ifPresent(filterData -> {
                        result.set(true);
                        applyFilterData(filterData);
                    });
            }
        }
        return result.get();
    }

    /**
     * Updates the default quality and slide width from the given filter data, ignoring values that are not usable
     * (not convertible to an integer, quality outside of 0-100, non-positive width).
     *
     * @param filterData the JSON object holding the {@code Quality} and {@code Width} keys
     */
    private void applyFilterData(JsonNode filterData)
    {
        getJsonNode(filterData, QUALITY_KEY).ifPresent(qualityNode -> {
            // Use an explicit fallback: the no-arg asInt() returns 0 for values that cannot be converted, which
            // would silently become the configured quality.
            int value = qualityNode.asInt(INVALID_VALUE);
            if (value >= 0 && value <= MAX_QUALITY) {
                this.quality = value;
            } else {
                logInvalidRegistryValue(QUALITY_KEY, qualityNode);
            }
        });
        getJsonNode(filterData, WIDTH_KEY).ifPresent(widthNode -> {
            int value = widthNode.asInt(INVALID_VALUE);
            if (value > 0) {
                this.slideWidth = value;
            } else {
                logInvalidRegistryValue(WIDTH_KEY, widthNode);
            }
        });
    }

    private void logInvalidRegistryValue(String key, JsonNode value)
    {
        this.logger.warn("Ignoring invalid value [{}] for [{}] in the document format registry, "
            + "the default value will be used.", value, key);
    }

    /**
     * @param jsonNode the node in which to look for the field
     * @param fieldName the name of the field to get
     * @return the value of the field, or an empty optional if the node has no such field
     */
    private Optional<JsonNode> getJsonNode(JsonNode jsonNode, String fieldName)
    {
        return Optional.ofNullable(jsonNode.get(fieldName));
    }

    /**
     * @return the width of the images to generate for slides in pixels; always strictly positive: when the
     *     configured value is not positive the default width is returned instead
     */
    public int getSlideWidth()
    {
        int configuredWidth =
            this.configurationSource.getProperty("officeimporter.presentation.slideWidth", this.slideWidth);
        // A non-positive width cannot be used to compute a rendering resolution.
        return configuredWidth > 0 ? configuredWidth : this.slideWidth;
    }

    /**
     * @return the image quality to use when converting slides to images, between 0 and 1 (the configured percentage
     *     is bounded to the 0-100 range before being scaled); always 0 for the PNG format
     */
    public float getQuality()
    {
        if (PNG_IMAGE_FORMAT.equalsIgnoreCase(this.getImageFormat())) {
            return 0f;
        }

        float configuredQuality =
            this.configurationSource.getProperty("officeimporter.presentation.quality", this.quality);
        // The quality is configured as a percentage but consumed as a ratio: keep it inside the valid range.
        float boundedQuality = Math.max(0f, Math.min(MAX_QUALITY, configuredQuality));
        return boundedQuality / MAX_QUALITY;
    }

    /**
     * @return the image format to use when converting slides to images.
     */
    public String getImageFormat()
    {
        return this.configurationSource.getProperty("officeimporter.presentation.imageFormat", DEFAULT_IMAGE_FORMAT);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ org.xwiki.officeimporter.internal.ModelBridge
org.xwiki.officeimporter.internal.builder.DefaultXHTMLOfficeDocumentBuilder
org.xwiki.officeimporter.internal.builder.DefaultXDOMOfficeDocumentBuilder
org.xwiki.officeimporter.internal.builder.DefaultPresentationBuilder
org.xwiki.officeimporter.internal.builder.PresentationBuilderConfiguration
org.xwiki.officeimporter.internal.cleaner.OfficeHTMLCleaner
org.xwiki.officeimporter.internal.cleaner.WysiwygHTMLCleaner
org.xwiki.officeimporter.internal.filter.AnchorFilter
Expand Down
Loading

0 comments on commit 1e71121

Please sign in to comment.