Skip to content

Commit

Permalink
Tika parser can be avoided by adding skip_tika: true in the config. B…
Browse files Browse the repository at this point in the history
…y default skip_tika: false of course.

Relevant Issue dadoonet#846
  • Loading branch information
shahariaazam committed Nov 30, 2019
1 parent e5970ff commit 949b333
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,9 @@ private void indexFile(FileAbstractModel fileAbstractModel, ScanStatistic stats,
} else if (fsSettings.getFs().isXmlSupport()) {
// https://github.com/dadoonet/fscrawler/issues/185 : Support Xml files
doc.setObject(XmlDocParser.generateMap(inputStream));
} else if (fsSettings.getFs().isSkipTika()) {
// https://github.com/dadoonet/fscrawler/issues/846 : Skip Tika parser
doc.setContent(inputStreamToString(inputStream));
} else {
// Extracting content with Tika
TikaDocParser.generate(fsSettings, inputStream, filename, fullFilename, doc, messageDigest, filesize);
Expand Down Expand Up @@ -592,4 +595,28 @@ private void esDelete(String index, String id) {
}
}

/**
 * Reads the whole stream and returns its content as a single string.
 * <p>
 * The content is decoded as UTF-8 and returned verbatim, including line
 * terminators, so words sitting on either side of a line break are not
 * glued together.
 * <p>
 * The stream is not closed here: it is left to the caller that supplied it.
 *
 * @param inputStream the stream to read until end of stream
 * @return the decoded content, or an empty string if the stream is empty or
 *         an {@link IOException} occurs while reading
 */
private String inputStreamToString(InputStream inputStream) {
    // Explicit charset: decoding must not depend on the platform default.
    InputStreamReader reader = new InputStreamReader(inputStream, java.nio.charset.StandardCharsets.UTF_8);
    StringBuilder sb = new StringBuilder();
    char[] buffer = new char[8192];
    try {
        int read;
        // Copy raw characters instead of using readLine(), which drops line terminators.
        while ((read = reader.read(buffer)) != -1) {
            sb.append(buffer, 0, read);
        }
        return sb.toString();
    } catch (IOException e) {
        // Pass the exception itself to the logger: getMessage() may be null and
        // the stack trace belongs in the log, not on stderr.
        logger.warn("Failed to read InputStream", e);
    }

    // Best effort: a read failure yields empty content rather than partial content.
    return "";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ public class Fs {
private Ocr ocr = new Ocr();
private ByteSizeValue ignoreAbove = null;
private boolean followSymlinks = false;
private boolean skipTika = false;

public static Builder builder() {
return new Builder();
Expand Down Expand Up @@ -91,6 +92,7 @@ public static class Builder {
private Ocr ocr = new Ocr();
private ByteSizeValue ignoreAbove = null;
private boolean followSymlinks = false;
private boolean skipTika = false;

public Builder setUrl(String url) {
this.url = url;
Expand Down Expand Up @@ -246,10 +248,15 @@ public Builder setFollowSymlinks(boolean followSymlinks) {
return this;
}

/**
 * Sets the skipTika flag on this builder. The value is handed to the
 * {@code Fs} constructor when {@code build()} is called.
 *
 * @param skipTika the value to store in the builder's skipTika field
 * @return this builder, so that calls can be chained
 */
public Builder setSkipTika(boolean skipTika) {
this.skipTika = skipTika;
return this;
}

public Fs build() {
return new Fs(url, updateRate, includes, excludes, filters, jsonSupport, filenameAsId, addFilesize,
removeDeleted, addAsInnerObject, storeSource, indexedChars, indexContent, attributesSupport, rawMetadata,
checksum, xmlSupport, indexFolders, langDetect, continueOnError, ocr, ignoreAbove, followSymlinks);
checksum, xmlSupport, indexFolders, langDetect, continueOnError, ocr, ignoreAbove, followSymlinks, skipTika);
}
}

Expand All @@ -260,7 +267,7 @@ public Fs( ) {
private Fs(String url, TimeValue updateRate, List<String> includes, List<String> excludes, List<String> filters, boolean jsonSupport,
boolean filenameAsId, boolean addFilesize, boolean removeDeleted, boolean addAsInnerObject, boolean storeSource,
Percentage indexedChars, boolean indexContent, boolean attributesSupport, boolean rawMetadata, String checksum, boolean xmlSupport,
boolean indexFolders, boolean langDetect, boolean continueOnError, Ocr ocr, ByteSizeValue ignoreAbove, boolean followSymlinks) {
boolean indexFolders, boolean langDetect, boolean continueOnError, Ocr ocr, ByteSizeValue ignoreAbove, boolean followSymlinks, boolean skipTika) {
this.url = url;
this.updateRate = updateRate;
this.includes = includes;
Expand All @@ -284,6 +291,7 @@ private Fs(String url, TimeValue updateRate, List<String> includes, List<String>
this.ocr = ocr;
this.ignoreAbove = ignoreAbove;
this.followSymlinks = followSymlinks;
this.skipTika = skipTika;
}

public String getUrl() {
Expand Down Expand Up @@ -478,6 +486,10 @@ public void setIgnoreAbove(ByteSizeValue ignoreAbove) {
this.ignoreAbove = ignoreAbove;
}

/**
 * Returns the skipTika flag of these settings. The field is initialised to
 * {@code false}.
 *
 * @return the current value of the skipTika field
 */
public boolean isSkipTika() {
return skipTika;
}

/**
 * Returns the followSymlinks flag of these settings. The field is
 * initialised to {@code false}.
 *
 * @return the current value of the followSymlinks field
 */
public boolean isFollowSymlinks() {
return followSymlinks;
}
Expand Down Expand Up @@ -548,6 +560,7 @@ public String toString() {
", ocr=" + ocr +
", ignoreAbove=" + ignoreAbove +
", followSymlinks=" + followSymlinks +
", skipTika=" + skipTika +
'}';
}
}

0 comments on commit 949b333

Please sign in to comment.