Skip to content

Commit

Permalink
close #1965: increase Tika SAXParser pool size, move POI static setting
Browse files Browse the repository at this point in the history
  • Loading branch information
lfcnassif committed Nov 3, 2023
1 parent 15ae2f0 commit 83cafef
Showing 1 changed file with 16 additions and 2 deletions.
18 changes: 16 additions & 2 deletions iped-engine/src/main/java/iped/engine/task/ParsingTask.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,12 @@
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
Expand All @@ -45,6 +47,7 @@
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.parser.html.IdentityHtmlMapper;
import org.apache.tika.utils.XMLReaderUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
Expand Down Expand Up @@ -159,6 +162,7 @@ public class ParsingTask extends ThumbTask implements EmbeddedDocumentExtractor
private static final Set<MediaType> typesToCheckZipBomb = getTypesToCheckZipbomb();

private static AtomicInteger containersBeingExpanded = new AtomicInteger();
private static AtomicBoolean tikaSAXPoolSizeSet = new AtomicBoolean(false);

private CategoryToExpandConfig expandConfig;
private ParsingTaskConfig parsingConfig;
Expand Down Expand Up @@ -237,8 +241,6 @@ private ParseContext getTikaContext(File output, IPEDSource ipedsource) {
context.set(ArchiveStreamFactory.class, new ArchiveStreamFactory("Cp850")); //$NON-NLS-1$
// Indexa conteudo de todos os elementos de HTMLs, como script, etc
context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
// we have seen very large records in valid docs
org.apache.poi.hpsf.CodePageString.setMaxRecordLength(512_000);

context.set(IStreamSource.class, evidence);
context.set(IItemReader.class, evidence);
Expand Down Expand Up @@ -791,6 +793,18 @@ public static void setupParsingOptions(ConfigurationManager configurationManager
ParsersConfig parserConfig = configurationManager.findObject(ParsersConfig.class);
System.setProperty("tika.config", parserConfig.getTmpConfigFile().getAbsolutePath());

// we have seen very large records in valid docs
org.apache.poi.hpsf.CodePageString.setMaxRecordLength(512_000);

// heavy Tika configuration
if (!tikaSAXPoolSizeSet.getAndSet(true)) {
try {
XMLReaderUtils.setPoolSize(Runtime.getRuntime().availableProcessors());
} catch (TikaException e) {
e.printStackTrace();
}
}

// most options below are set using sys props because they are also used by
// child external processes

Expand Down

0 comments on commit 83cafef

Please sign in to comment.