diff --git a/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParserAbstract.java b/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParserAbstract.java
index ecd26e342..2470e716d 100644
--- a/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParserAbstract.java
+++ b/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParserAbstract.java
@@ -469,6 +469,9 @@ private void indexFile(FileAbstractModel fileAbstractModel, ScanStatistic stats,
                 } else if (fsSettings.getFs().isXmlSupport()) {
                     // https://github.com/dadoonet/fscrawler/issues/185 : Support Xml files
                     doc.setObject(XmlDocParser.generateMap(inputStream));
+                } else if (fsSettings.getFs().isSkipTika()) {
+                    // https://github.com/dadoonet/fscrawler/issues/846 : Skip Tika parser
+                    doc.setContent(inputStreamToString(inputStream));
                 } else {
                     // Extracting content with Tika
                     TikaDocParser.generate(fsSettings, inputStream, filename, fullFilename, doc, messageDigest, filesize);
@@ -592,4 +595,33 @@ private void esDelete(String index, String id) {
         }
     }
 
+    /**
+     * Reads the whole stream and returns its content as a raw string, without using Tika.
+     * <p>
+     * The bytes are decoded as UTF-8 and line terminators are preserved. The stream is
+     * not closed by this method.
+     *
+     * @param inputStream the stream to read; it is consumed up to its end
+     * @return the decoded content, or an empty string if the stream can not be read
+     */
+    private String inputStreamToString(InputStream inputStream) {
+        StringBuilder content = new StringBuilder();
+        char[] buffer = new char[8192];
+        try {
+            // Explicit charset so the result does not depend on the platform default.
+            InputStreamReader reader =
+                    new InputStreamReader(inputStream, java.nio.charset.StandardCharsets.UTF_8);
+            int read;
+            // Copy raw characters instead of using readLine(), which drops line terminators
+            // and would glue the last word of a line to the first word of the next one.
+            while ((read = reader.read(buffer)) != -1) {
+                content.append(buffer, 0, read);
+            }
+            return content.toString();
+        } catch (IOException e) {
+            // Best effort: fall back to an empty content, but keep the cause visible in the logs.
+            logger.warn("Failed to read InputStream", e);
+            return "";
+        }
+    }
 }
diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java
index 3aef0eb33..bffd65c14 100644
--- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java
+++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java
@@ -58,6 +58,7 @@ public class Fs {
     private Ocr ocr = new Ocr();
     private ByteSizeValue ignoreAbove = null;
     private boolean followSymlinks = false;
+    private boolean skipTika = false;
 
     public static Builder builder() {
         return new Builder();
@@ -91,6 +92,7 @@ public static class Builder {
         private Ocr ocr = new Ocr();
         private ByteSizeValue ignoreAbove = null;
         private boolean followSymlinks = false;
+        private boolean skipTika = false;
 
         public Builder setUrl(String url) {
             this.url = url;
@@ -246,10 +248,15 @@ public Builder setFollowSymlinks(boolean followSymlinks) {
             return this;
         }
 
+        public Builder setSkipTika(boolean skipTika) {
+            this.skipTika = skipTika;
+            return this;
+        }
+
         public Fs build() {
             return new Fs(url, updateRate, includes, excludes, filters, jsonSupport, filenameAsId, addFilesize, removeDeleted,
                     addAsInnerObject, storeSource, indexedChars, indexContent, attributesSupport, rawMetadata,
-                    checksum, xmlSupport, indexFolders, langDetect, continueOnError, ocr, ignoreAbove, followSymlinks);
+                    checksum, xmlSupport, indexFolders, langDetect, continueOnError, ocr, ignoreAbove, followSymlinks, skipTika);
         }
     }
 
@@ -260,7 +267,7 @@ public Fs( ) {
     private Fs(String url, TimeValue updateRate, List<String> includes, List<String> excludes, List<String> filters, boolean jsonSupport,
                boolean filenameAsId, boolean addFilesize, boolean removeDeleted, boolean addAsInnerObject, boolean storeSource,
                Percentage indexedChars, boolean indexContent, boolean attributesSupport, boolean rawMetadata, String checksum, boolean xmlSupport,
-               boolean indexFolders, boolean langDetect, boolean continueOnError, Ocr ocr, ByteSizeValue ignoreAbove, boolean followSymlinks) {
+               boolean indexFolders, boolean langDetect, boolean continueOnError, Ocr ocr, ByteSizeValue ignoreAbove, boolean followSymlinks, boolean skipTika) {
         this.url = url;
         this.updateRate = updateRate;
         this.includes = includes;
@@ -284,6 +291,7 @@ private Fs(String url, TimeValue updateRate, List<String> includes, List<String>
         this.ocr = ocr;
         this.ignoreAbove = ignoreAbove;
         this.followSymlinks = followSymlinks;
+        this.skipTika = skipTika;
     }
 
     public String getUrl() {
@@ -478,6 +486,14 @@ public void setIgnoreAbove(ByteSizeValue ignoreAbove) {
         this.ignoreAbove = ignoreAbove;
     }
 
+    public boolean isSkipTika() {
+        return skipTika;
+    }
+
+    public void setSkipTika(boolean skipTika) {
+        this.skipTika = skipTika;
+    }
+
     public boolean isFollowSymlinks() {
         return followSymlinks;
     }
@@ -548,6 +564,7 @@ public String toString() {
                 ", ocr=" + ocr +
                 ", ignoreAbove=" + ignoreAbove +
                 ", followSymlinks=" + followSymlinks +
+                ", skipTika=" + skipTika +
                 '}';
     }
 }