diff --git a/.gitmodules b/.gitmodules
index 9a01451a21..f758ba0123 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,6 @@
 [submodule "test"]
 	path = test
 	url = https://github.com/tesseract-ocr/test
+[submodule "tessdata"]
+	path = tessdata
+	url = https://github.com/tesseract-ocr/tessconfigs
diff --git a/Makefile.am b/Makefile.am
index e427170e14..f55df8c2ac 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,4 +1,9 @@
 ## run autogen.sh to create Makefile.in from this file
+
+# Default location for tessdata directory.
+# It can be overridden with configure option --datarootdir=DIR.
+datadir = @datarootdir@/tessdata
+
 ACLOCAL_AMFLAGS = -I m4
 
 if ENABLE_TRAINING
@@ -18,17 +23,33 @@ training:
 	@echo "Need to reconfigure project, so there are no errors"
 endif
 
-.PHONY: doc install-langs ScrollView.jar install-jars training
+.PHONY: doc install-langs ScrollView.jar install-jars install-tessdata training
 
 SUBDIRS = src/arch src/ccutil src/viewer src/cutil src/opencl src/ccstruct
 SUBDIRS += src/dict src/classify src/wordrec src/textord src/lstm
-SUBDIRS += src/ccmain src/api . tessdata doc unittest
+SUBDIRS += src/ccmain src/api . doc unittest
 
 EXTRA_DIST = README.md LICENSE
 EXTRA_DIST += aclocal.m4 config configure.ac autogen.sh
 EXTRA_DIST += tesseract.pc.in $(TRAINING_SUBDIR) java doc
 EXTRA_DIST += CMakeLists.txt tesseract.pc.cmake cmake VERSION src/vs2010 cppan.yml
 
+# Files for tessdata.
+TESSDATA_FILES = $(top_srcdir)/tessdata/pdf.ttf
+
+# Files for tessdata/configs.
+CONFIG_FILES = inter makebox box.train unlv ambigs.train lstm.train lstmdebug
+CONFIG_FILES += api_config kannada box.train.stderr quiet logfile digits get.images
+CONFIG_FILES += lstmbox wordstrbox
+# Configurations for OCR output.
+CONFIG_FILES += alto hocr pdf tsv txt
+CONFIG_FILES += linebox rebox strokewidth bigram
+TESSDATA_CONFIG_FILES = ${CONFIG_FILES:%=$(top_srcdir)/tessdata/configs/%}
+
+# Files for tessdata/tessconfigs.
+TESSCONFIG_FILES = batch batch.nochop nobatch matdemo segdemo msdemo
+TESSDATA_TESSCONFIG_FILES = ${TESSCONFIG_FILES:%=$(top_srcdir)/tessdata/tessconfigs/%}
+
 DIST_SUBDIRS = $(SUBDIRS) $(TRAINING_SUBDIR)
 
 uninstall-hook:
@@ -50,6 +71,15 @@ ScrollView.jar:
 install-jars:
 	@cd "$(top_builddir)/java" && $(MAKE) $@
 
+# Install pdf.ttf, configs and tessconfigs from the tessdata submodule (honours DESTDIR).
+install-tessdata:
+	$(MKDIR_P) "$(DESTDIR)$(datadir)"
+	$(INSTALL_DATA) $(TESSDATA_FILES) "$(DESTDIR)$(datadir)"
+	$(MKDIR_P) "$(DESTDIR)$(datadir)/configs"
+	$(INSTALL_DATA) $(TESSDATA_CONFIG_FILES) "$(DESTDIR)$(datadir)/configs"
+	$(MKDIR_P) "$(DESTDIR)$(datadir)/tessconfigs"
+	$(INSTALL_DATA) $(TESSDATA_TESSCONFIG_FILES) "$(DESTDIR)$(datadir)/tessconfigs"
+
 doc:
 	-srcdir="$(top_srcdir)" builddir="$(top_builddir)" \
 	version="@PACKAGE_VERSION@" name="@PACKAGE_NAME@" \
diff --git a/configure.ac b/configure.ac
index a36656b06e..81a4b5bb8b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -482,9 +482,6 @@ AC_CONFIG_FILES([src/lstm/Makefile])
 AC_CONFIG_FILES([src/textord/Makefile])
 AC_CONFIG_FILES([src/viewer/Makefile])
 AC_CONFIG_FILES([src/wordrec/Makefile])
-AC_CONFIG_FILES([tessdata/Makefile])
-AC_CONFIG_FILES([tessdata/configs/Makefile])
-AC_CONFIG_FILES([tessdata/tessconfigs/Makefile])
 AC_CONFIG_FILES([unittest/Makefile])
 AC_CONFIG_FILES([java/Makefile])
 AC_CONFIG_FILES([java/com/Makefile])
diff --git a/tessdata b/tessdata
new file mode 160000
index 0000000000..3c86eb0462
--- /dev/null
+++ b/tessdata
@@ -0,0 +1 @@
+Subproject commit 3c86eb0462cb00c29b4b8bc4cc54002547d5d14e
diff --git a/tessdata/Makefile.am b/tessdata/Makefile.am
deleted file mode 100644
index 65ec7f0f25..0000000000
--- a/tessdata/Makefile.am
+++ /dev/null
@@ -1,12 +0,0 @@
-datadir = @datadir@/tessdata
-
-data_DATA = pdf.ttf
-EXTRA_DIST = $(data_DATA)
-
-SUBDIRS = configs tessconfigs
-
-langdata =
-
-uninstall-local:
-	cd $(DESTDIR)$(datadir); \
-	rm -f $(langdata)
diff --git a/tessdata/configs/Makefile.am
b/tessdata/configs/Makefile.am deleted file mode 100644 index 90619378f8..0000000000 --- a/tessdata/configs/Makefile.am +++ /dev/null @@ -1,8 +0,0 @@ -datadir = @datadir@/tessdata/configs -data_DATA = inter makebox box.train unlv ambigs.train lstm.train lstmdebug -data_DATA += api_config kannada box.train.stderr quiet logfile digits get.images -data_DATA += lstmbox wordstrbox -# Configurations for OCR output. -data_DATA += alto hocr pdf tsv txt -data_DATA += linebox rebox strokewidth bigram -EXTRA_DIST = $(data_DATA) diff --git a/tessdata/configs/alto b/tessdata/configs/alto deleted file mode 100644 index 0dd12a7a70..0000000000 --- a/tessdata/configs/alto +++ /dev/null @@ -1 +0,0 @@ -tessedit_create_alto 1 diff --git a/tessdata/configs/ambigs.train b/tessdata/configs/ambigs.train deleted file mode 100644 index 23035a1904..0000000000 --- a/tessdata/configs/ambigs.train +++ /dev/null @@ -1,7 +0,0 @@ -tessedit_ambigs_training 1 -load_freq_dawg 0 -load_punc_dawg 0 -load_system_dawg 0 -load_number_dawg 0 -ambigs_debug_level 3 -load_fixed_length_dawgs 0 diff --git a/tessdata/configs/api_config b/tessdata/configs/api_config deleted file mode 100644 index 5cd6ec0310..0000000000 --- a/tessdata/configs/api_config +++ /dev/null @@ -1 +0,0 @@ -tessedit_zero_rejection T diff --git a/tessdata/configs/bazaar b/tessdata/configs/bazaar deleted file mode 100644 index 1b2ee831ce..0000000000 --- a/tessdata/configs/bazaar +++ /dev/null @@ -1,4 +0,0 @@ -load_system_dawg F -load_freq_dawg F -user_words_suffix user-words -user_patterns_suffix user-patterns diff --git a/tessdata/configs/bigram b/tessdata/configs/bigram deleted file mode 100644 index 5d6c2d061f..0000000000 --- a/tessdata/configs/bigram +++ /dev/null @@ -1,5 +0,0 @@ -load_bigram_dawg True -tessedit_enable_bigram_correction True -tessedit_bigram_debug 3 -save_raw_choices True -save_alt_choices True diff --git a/tessdata/configs/box.train b/tessdata/configs/box.train deleted file mode 100644 index f1836cff14..0000000000 --- 
a/tessdata/configs/box.train +++ /dev/null @@ -1,13 +0,0 @@ -disable_character_fragments T -file_type .bl -textord_fast_pitch_test T -tessedit_zero_rejection T -tessedit_minimal_rejection F -tessedit_write_rep_codes F -il1_adaption_test 1 -edges_children_fix F -edges_childarea 0.65 -edges_boxarea 0.9 -tessedit_resegment_from_boxes T -tessedit_train_from_boxes T -textord_no_rejects T diff --git a/tessdata/configs/box.train.stderr b/tessdata/configs/box.train.stderr deleted file mode 100644 index a5bd4bd0ed..0000000000 --- a/tessdata/configs/box.train.stderr +++ /dev/null @@ -1,14 +0,0 @@ -file_type .bl -#tessedit_use_nn F -textord_fast_pitch_test T -tessedit_zero_rejection T -tessedit_minimal_rejection F -tessedit_write_rep_codes F -il1_adaption_test 1 -edges_children_fix F -edges_childarea 0.65 -edges_boxarea 0.9 -tessedit_resegment_from_boxes T -tessedit_train_from_boxes T -#textord_repeat_extraction F -textord_no_rejects T diff --git a/tessdata/configs/digits b/tessdata/configs/digits deleted file mode 100644 index 6a329f8929..0000000000 --- a/tessdata/configs/digits +++ /dev/null @@ -1 +0,0 @@ -tessedit_char_whitelist 0123456789-. 
diff --git a/tessdata/configs/get.images b/tessdata/configs/get.images deleted file mode 100644 index 7d00b613ff..0000000000 --- a/tessdata/configs/get.images +++ /dev/null @@ -1 +0,0 @@ -tessedit_write_images T diff --git a/tessdata/configs/hocr b/tessdata/configs/hocr deleted file mode 100644 index 5ab372eaf8..0000000000 --- a/tessdata/configs/hocr +++ /dev/null @@ -1,2 +0,0 @@ -tessedit_create_hocr 1 -hocr_font_info 0 diff --git a/tessdata/configs/inter b/tessdata/configs/inter deleted file mode 100644 index 252f1a171a..0000000000 --- a/tessdata/configs/inter +++ /dev/null @@ -1,2 +0,0 @@ -interactive_display_mode T -tessedit_display_outwords T diff --git a/tessdata/configs/kannada b/tessdata/configs/kannada deleted file mode 100644 index c6ac105788..0000000000 --- a/tessdata/configs/kannada +++ /dev/null @@ -1,4 +0,0 @@ -textord_skewsmooth_offset 8 -textord_skewsmooth_offset2 8 -textord_merge_desc 0.5 -textord_no_rejects 1 diff --git a/tessdata/configs/linebox b/tessdata/configs/linebox deleted file mode 100644 index bd9c114df6..0000000000 --- a/tessdata/configs/linebox +++ /dev/null @@ -1,2 +0,0 @@ -tessedit_resegment_from_line_boxes 1 -tessedit_make_boxes_from_boxes 1 diff --git a/tessdata/configs/logfile b/tessdata/configs/logfile deleted file mode 100644 index a160f9be27..0000000000 --- a/tessdata/configs/logfile +++ /dev/null @@ -1 +0,0 @@ -debug_file tesseract.log diff --git a/tessdata/configs/lstm.train b/tessdata/configs/lstm.train deleted file mode 100644 index 9fa52007b8..0000000000 --- a/tessdata/configs/lstm.train +++ /dev/null @@ -1,12 +0,0 @@ -disable_character_fragments T -file_type .bl -textord_fast_pitch_test T -tessedit_zero_rejection T -tessedit_minimal_rejection F -tessedit_write_rep_codes F -il1_adaption_test 1 -edges_children_fix F -edges_childarea 0.65 -edges_boxarea 0.9 -tessedit_train_line_recognizer T -textord_no_rejects T diff --git a/tessdata/configs/lstmbox b/tessdata/configs/lstmbox deleted file mode 100644 index 
a6f2cedc50..0000000000 --- a/tessdata/configs/lstmbox +++ /dev/null @@ -1 +0,0 @@ -tessedit_create_lstmbox 1 diff --git a/tessdata/configs/lstmdebug b/tessdata/configs/lstmdebug deleted file mode 100644 index 3fa3dee71a..0000000000 --- a/tessdata/configs/lstmdebug +++ /dev/null @@ -1,4 +0,0 @@ -stopper_debug_level 1 -classify_debug_level 1 -segsearch_debug_level 1 -language_model_debug_level 3 diff --git a/tessdata/configs/makebox b/tessdata/configs/makebox deleted file mode 100644 index 3d90ac26f9..0000000000 --- a/tessdata/configs/makebox +++ /dev/null @@ -1 +0,0 @@ -tessedit_create_boxfile 1 diff --git a/tessdata/configs/pdf b/tessdata/configs/pdf deleted file mode 100644 index 59645d71ce..0000000000 --- a/tessdata/configs/pdf +++ /dev/null @@ -1 +0,0 @@ -tessedit_create_pdf 1 diff --git a/tessdata/configs/quiet b/tessdata/configs/quiet deleted file mode 100644 index 35b59a9d41..0000000000 --- a/tessdata/configs/quiet +++ /dev/null @@ -1 +0,0 @@ -debug_file /dev/null diff --git a/tessdata/configs/rebox b/tessdata/configs/rebox deleted file mode 100644 index f8342b4c2c..0000000000 --- a/tessdata/configs/rebox +++ /dev/null @@ -1,2 +0,0 @@ -tessedit_resegment_from_boxes 1 -tessedit_make_boxes_from_boxes 1 diff --git a/tessdata/configs/strokewidth b/tessdata/configs/strokewidth deleted file mode 100644 index e95b59263d..0000000000 --- a/tessdata/configs/strokewidth +++ /dev/null @@ -1,12 +0,0 @@ -textord_show_blobs 0 -textord_debug_tabfind 3 -textord_tabfind_show_partitions 1 -textord_tabfind_show_initial_partitions 1 -textord_tabfind_show_columns 1 -textord_tabfind_show_blocks 1 -textord_tabfind_show_initialtabs 1 -textord_tabfind_show_finaltabs 1 -textord_tabfind_show_strokewidths 1 -textord_tabfind_show_vlines 0 -textord_tabfind_show_images 1 -tessedit_dump_pageseg_images 0 diff --git a/tessdata/configs/tsv b/tessdata/configs/tsv deleted file mode 100644 index dc52478177..0000000000 --- a/tessdata/configs/tsv +++ /dev/null @@ -1 +0,0 @@ -tessedit_create_tsv 1 
diff --git a/tessdata/configs/txt b/tessdata/configs/txt deleted file mode 100644 index 5046f0b045..0000000000 --- a/tessdata/configs/txt +++ /dev/null @@ -1,3 +0,0 @@ -# This config file should be used with other cofig files which creates renderers. -# usage example: tesseract eurotext.tif eurotext txt hocr pdf -tessedit_create_txt 1 diff --git a/tessdata/configs/unlv b/tessdata/configs/unlv deleted file mode 100644 index d2e22f5b93..0000000000 --- a/tessdata/configs/unlv +++ /dev/null @@ -1,2 +0,0 @@ -tessedit_write_unlv 1 -unlv_tilde_crunching T diff --git a/tessdata/configs/wordstrbox b/tessdata/configs/wordstrbox deleted file mode 100644 index 38cd41cd60..0000000000 --- a/tessdata/configs/wordstrbox +++ /dev/null @@ -1 +0,0 @@ -tessedit_create_wordstrbox 1 diff --git a/tessdata/eng.user-patterns b/tessdata/eng.user-patterns deleted file mode 100644 index 5daba44df8..0000000000 --- a/tessdata/eng.user-patterns +++ /dev/null @@ -1,2 +0,0 @@ -1-\d\d\d-GOOG-411 -www.\n\\\*.com diff --git a/tessdata/eng.user-words b/tessdata/eng.user-words deleted file mode 100644 index e0c5a63021..0000000000 --- a/tessdata/eng.user-words +++ /dev/null @@ -1,5 +0,0 @@ -the -quick -brown -fox -jumped diff --git a/tessdata/pdf.ttf b/tessdata/pdf.ttf deleted file mode 100644 index d1472b20ef..0000000000 Binary files a/tessdata/pdf.ttf and /dev/null differ diff --git a/tessdata/tessconfigs/Makefile.am b/tessdata/tessconfigs/Makefile.am deleted file mode 100644 index 38a57be3d0..0000000000 --- a/tessdata/tessconfigs/Makefile.am +++ /dev/null @@ -1,3 +0,0 @@ -datadir = @datadir@/tessdata/tessconfigs -data_DATA = batch batch.nochop nobatch matdemo segdemo msdemo -EXTRA_DIST = batch batch.nochop nobatch matdemo segdemo msdemo diff --git a/tessdata/tessconfigs/batch b/tessdata/tessconfigs/batch deleted file mode 100644 index a681e4a443..0000000000 --- a/tessdata/tessconfigs/batch +++ /dev/null @@ -1 +0,0 @@ -# No content needed as all defaults are correct. 
diff --git a/tessdata/tessconfigs/batch.nochop b/tessdata/tessconfigs/batch.nochop deleted file mode 100644 index ebaab9438e..0000000000 --- a/tessdata/tessconfigs/batch.nochop +++ /dev/null @@ -1,2 +0,0 @@ -chop_enable 0 -wordrec_enable_assoc 0 diff --git a/tessdata/tessconfigs/matdemo b/tessdata/tessconfigs/matdemo deleted file mode 100644 index c34567be75..0000000000 --- a/tessdata/tessconfigs/matdemo +++ /dev/null @@ -1,7 +0,0 @@ -################################################# -# Adaptive Matcher Using PreAdapted Templates -################################################# - -classify_enable_adaptive_debugger 1 -matcher_debug_flags 6 -matcher_debug_level 1 diff --git a/tessdata/tessconfigs/msdemo b/tessdata/tessconfigs/msdemo deleted file mode 100644 index a1af21fe61..0000000000 --- a/tessdata/tessconfigs/msdemo +++ /dev/null @@ -1,13 +0,0 @@ -################################################# -# Adaptive Matcher Using PreAdapted Templates -################################################# - -classify_enable_adaptive_debugger 1 -matcher_debug_flags 6 -matcher_debug_level 1 - -wordrec_display_splits 0 -wordrec_display_all_words 1 -wordrec_display_all_blobs 1 -wordrec_display_segmentations 2 -classify_debug_level 1 diff --git a/tessdata/tessconfigs/nobatch b/tessdata/tessconfigs/nobatch deleted file mode 100644 index 8b13789179..0000000000 --- a/tessdata/tessconfigs/nobatch +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tessdata/tessconfigs/segdemo b/tessdata/tessconfigs/segdemo deleted file mode 100644 index d7d90ae690..0000000000 --- a/tessdata/tessconfigs/segdemo +++ /dev/null @@ -1,10 +0,0 @@ -################################################# -# Adaptive Matcher Using PreAdapted Templates -################################################# - -wordrec_display_splits 0 -wordrec_display_all_words 1 -wordrec_display_all_blobs 1 -wordrec_display_segmentations 2 -classify_debug_level 1 -stopper_debug_level 1