forked from vuthaihoc/tika_server_forever
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tika_config.xml
51 lines (50 loc) · 2.85 KB
/
tika_config.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
<?xml version="1.0" encoding="UTF-8"?>
<properties>
<parsers>
<!-- Explicitly registered document parsers: PDF, OpenDocument, Microsoft Office, RTF, XML, EPUB and CHM. -->
<!-- NOTE(review): the DefaultParser entry declared further down in this list presumably already covers these formats; confirm whether the explicit entries are still required. -->
<parser class="org.apache.tika.parser.pdf.PDFParser"/>
<!-- OpenDocument family. The legacy entry org.apache.tika.parser.opendocument.OpenOfficeParser was dropped: that class was renamed to OpenDocumentParser (registered on the next line), so the old name is a stale reference that current Tika releases cannot resolve. -->
<parser class="org.apache.tika.parser.odf.OpenDocumentParser"/>
<parser class="org.apache.tika.parser.odf.OpenDocumentContentParser"/>
<parser class="org.apache.tika.parser.odf.OpenDocumentMetaParser"/>
<!-- Microsoft Office: OOXML, legacy OLE2 and old Excel formats. -->
<parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/>
<parser class="org.apache.tika.parser.microsoft.OfficeParser"/>
<parser class="org.apache.tika.parser.microsoft.OldExcelParser"/>
<!-- Other text and markup formats. -->
<parser class="org.apache.tika.parser.rtf.RTFParser"/>
<parser class="org.apache.tika.parser.xml.XMLParser"/>
<parser class="org.apache.tika.parser.epub.EpubParser"/>
<parser class="org.apache.tika.parser.epub.EpubContentParser"/>
<!-- NOTE(review): Tika 2.x appears to ship the CHM parser as org.apache.tika.parser.microsoft.chm.ChmParser; confirm this class name against the deployed Tika version. -->
<parser class="org.apache.tika.parser.chm.ChmParser"/>
<!-- Start of OCR configuration -->
<parser class="org.apache.tika.parser.DefaultParser">
  <!-- Not formally required: excluding the OCR parser here keeps DefaultParser from loading a Tesseract parser that is not needed, since one is configured separately in this file. -->
  <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
</parser>
<!-- Tesseract OCR parser; every setting is supplied as a typed param. -->
<parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
  <params>
    <!-- Image preprocessing is switched off below. NOTE(review): applyRotation, colorSpace, density, depth, filter and resize presumably only take effect when enableImagePreprocessing is true; confirm before tuning them. -->
    <param name="applyRotation" type="bool">false</param>
    <param name="colorSpace" type="string">gray</param>
    <param name="density" type="int">300</param>
    <param name="depth" type="int">4</param>
    <param name="enableImagePreprocessing" type="bool">false</param>
    <param name="filter" type="string">triangle</param>
    <!-- NOTE(review): absolute, machine-specific directory; confirm it exists on every host that uses this file. -->
    <param name="imageMagickPath" type="string">/usr/local/bin/</param>
    <!-- A param name can carry only one value. "language" used to be declared twice (eng, then vie), so one of the two was silently dropped. Tesseract accepts several languages joined with "+", which keeps both. -->
    <param name="language" type="string">eng+vie</param>
    <!-- Files of any size from 0 bytes up to 2147483647 bytes are sent to OCR. -->
    <param name="maxFileSizeToOcr" type="long">2147483647</param>
    <param name="minFileSizeToOcr" type="long">0</param>
    <param name="pageSegMode" type="string">1</param>
    <!-- Intentionally empty value. -->
    <param name="pageSeparator" type="string"></param>
    <param name="preserveInterwordSpacing" type="bool">false</param>
    <param name="resize" type="int">200</param>
    <param name="skipOcr" type="bool">false</param>
    <!-- NOTE(review): this path is pinned to one installed tesseract version (5.2.0) and will stop resolving after an upgrade; consider a version-independent tessdata location. -->
    <param name="tessdataPath" type="string">/usr/local/Cellar/tesseract/5.2.0/share/tessdata/</param>
    <param name="tesseractPath" type="string">/usr/local/bin/</param>
    <!-- OCR timeout of 600 seconds (10 minutes). -->
    <param name="timeoutSeconds" type="int">600</param>
  </params>
</parser>
<!-- End of OCR configuration -->
</parsers>
<detectors>
  <!--
    Use Tika's standard detector chain.
    The entry previously configured here, org.apache.tika.mime.MagicDetector, does not name a
    loadable detector: MagicDetector lives in org.apache.tika.detect and has no no-argument
    constructor, so it cannot be instantiated from this file.
    NOTE(review): presumably Tika ignored that entry and fell back to its default detector, which
    is what is now requested explicitly; confirm. If detection from the MIME database alone is
    wanted instead, org.apache.tika.mime.MimeTypes is the configurable alternative.
  -->
  <detector class="org.apache.tika.detect.DefaultDetector"/>
</detectors>
</properties>