From ae8cc63aff71f827eb03dcbf765b5598950f2271 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 12 Dec 2019 00:36:24 +0100 Subject: [PATCH 1/3] ALTO renderer: move to v4, add Glyphs --- src/api/altorenderer.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/api/altorenderer.cpp b/src/api/altorenderer.cpp index 14b8fe11c2..0a5a91848e 100644 --- a/src/api/altorenderer.cpp +++ b/src/api/altorenderer.cpp @@ -45,6 +45,9 @@ static void AddBoxToAlto(const ResultIterator* it, PageIteratorLevel level, if (level == RIL_WORD) { int wc = it->Confidence(RIL_WORD); alto_str << " WC=\"0." << wc << "\""; + } else if (level == RIL_SYMBOL) { + int gc = it->Confidence(RIL_SYMBOL); + alto_str << " GC=\"0." << gc << "\""; } else { alto_str << ">"; } @@ -56,11 +59,11 @@ static void AddBoxToAlto(const ResultIterator* it, PageIteratorLevel level, bool TessAltoRenderer::BeginDocumentHandler() { AppendString( "\n" - "\n" + "xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v4# " + "https://www.loc.gov/standards/alto/v4/alto-4-0.xsd\">\n" "\t\n" "\t\tpixel\n" "\t\t\n" @@ -127,7 +130,7 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) { if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) return nullptr; - int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0; + int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0, scnt = 0; if (input_file_ == nullptr) SetInputName(nullptr); @@ -187,7 +190,7 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) { alto_str << "\t\t\t\t\t\t\tGetUTF8Text(RIL_WORD)).c_str() << "\">"; bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); @@ -198,15 +201,21 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) { res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); do { + alto_str << "\n\t\t\t\t\t\t\t\t grapheme( res_it->GetUTF8Text(RIL_SYMBOL)); if (grapheme && grapheme[0] != 0) { alto_str << HOcrEscape(grapheme.get()).c_str(); } + alto_str << "\"/>"; res_it->Next(RIL_SYMBOL); + + scnt++; } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); - alto_str << "\"/>"; + alto_str << "\n\t\t\t\t\t\t\t"; wcnt++; From fe17864d51e4202241b1d27f5d8ae6b3d79fe6c6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Dec 2019 09:51:59 +0100 Subject: [PATCH 2/3] ALTO renderer: indent SP, add Variant --- src/api/altorenderer.cpp | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/api/altorenderer.cpp b/src/api/altorenderer.cpp index 0a5a91848e..346e818e2e 100644 --- a/src/api/altorenderer.cpp +++ b/src/api/altorenderer.cpp @@ -209,7 +209,18 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) { if (grapheme && grapheme[0] != 0) { alto_str << HOcrEscape(grapheme.get()).c_str(); } - alto_str << "\"/>"; + alto_str << "\">"; + ChoiceIterator choice_it(*res_it); + do { + int vc = choice_it.Confidence(); + alto_str << "\n\t\t\t\t\t\t\t\t\t"; + } while (choice_it.Next()); + alto_str << "\n\t\t\t\t\t\t\t\t"; res_it->Next(RIL_SYMBOL); scnt++; @@ -227,8 +238,12 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) { int vpos = top; res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); int width = left - hpos; - alto_str << "\n"; + int height = bottom - top; + alto_str << "\n\t\t\t\t\t\t\t\n"; } if (last_word_in_tblock) { From 7d9437fcbf8816b0611a199e2bf89ac10283d08a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 17 Dec 2019 19:41:08 +0100 Subject: [PATCH 3/3] ALTO renderer: use proper BlockTypes - use TextBlock, Illustration, GraphicalElement (not just TextBlock), as appropriate for the internal block types - do not enter RIL_TEXTLINE, RIL_WORD, RIL_SYMBOL and ChoiceIterator on anything other than TextBlocks - refactor loop to make it more readable --- src/api/altorenderer.cpp | 178 +++++++++++++++++++++------------------ 1 file changed, 95 insertions(+), 83 deletions(-) diff --git a/src/api/altorenderer.cpp b/src/api/altorenderer.cpp index 346e818e2e..9e5c06d285 100644 --- a/src/api/altorenderer.cpp +++ b/src/api/altorenderer.cpp @@ -164,97 +164,109 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) { << " HEIGHT=\"" << rect_height_ << "\">\n"; ResultIterator* res_it = GetIterator(); - while (!res_it->Empty(RIL_BLOCK)) { - if (res_it->Empty(RIL_WORD)) { - res_it->Next(RIL_WORD); - continue; + for (; !res_it->Empty(RIL_BLOCK); res_it->Next(RIL_BLOCK)) { + alto_str << "\t\t\t\tBlockType()) { + case PT_FLOWING_TEXT: + case PT_HEADING_TEXT: + case PT_PULLOUT_TEXT: + case PT_CAPTION_TEXT: + case PT_VERTICAL_TEXT: + case PT_TABLE: // nothing special here + case PT_EQUATION: + case PT_INLINE_EQUATION: + block_type = "TextBlock"; + break; + case PT_FLOWING_IMAGE: + case PT_HEADING_IMAGE: + case PT_PULLOUT_IMAGE: + block_type = "Illustration"; + break; + case PT_HORZ_LINE: + case PT_VERT_LINE: + block_type = "GraphicalElement"; + break; + default: + block_type = "ComposedBlock"; } - if (res_it->IsAtBeginningOf(RIL_BLOCK)) { - alto_str << "\t\t\t\tIsAtBeginningOf(RIL_PARA)) { - alto_str << "\t\t\t\t\tEmpty(RIL_PARA); res_it->Next(RIL_PARA)) { + alto_str << "\t\t\t\t\t<" << block_type << " ID=\"block_" << tcnt << "\""; AddBoxToAlto(res_it, RIL_PARA, alto_str); alto_str << "\n"; - } - - if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { - alto_str << "\t\t\t\t\t\tGetUTF8Text(RIL_WORD)).c_str() << "\">"; - - bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); - bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); - bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); - - int left, top, right, bottom; - res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); - - do { - alto_str << "\n\t\t\t\t\t\t\t\t grapheme( - res_it->GetUTF8Text(RIL_SYMBOL)); - if (grapheme && grapheme[0] != 0) { - alto_str << HOcrEscape(grapheme.get()).c_str(); + if (strcmp(block_type, "TextBlock") == 0) { + for (; !res_it->Empty(RIL_TEXTLINE); res_it->Next(RIL_TEXTLINE)) { + alto_str << "\t\t\t\t\t\tEmpty(RIL_WORD); res_it->Next(RIL_WORD)) { + int left = 0, top = 0, right = 0, bottom = 0; + if (!res_it->IsAtBeginningOf(RIL_TEXTLINE)) { + int hpos = right; + int vpos = top; + res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); + int width = left - hpos; + int height = bottom - top; + alto_str << "\n\t\t\t\t\t\t\t\n"; + } + res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); + + alto_str << "\t\t\t\t\t\t\tGetUTF8Text(RIL_WORD)).c_str() << "\">"; + + for (; !res_it->Empty(RIL_SYMBOL); res_it->Next(RIL_SYMBOL)) { + alto_str << "\n\t\t\t\t\t\t\t\t grapheme(res_it->GetUTF8Text(RIL_SYMBOL)); + if (grapheme && grapheme[0] != 0) + alto_str << HOcrEscape(grapheme.get()).c_str(); + alto_str << "\">"; + + ChoiceIterator choice_it(*res_it); + do { + int vc = choice_it.Confidence(); + alto_str << "\n\t\t\t\t\t\t\t\t\t"; + } while (choice_it.Next()); + alto_str << "\n\t\t\t\t\t\t\t\t"; + scnt++; + if (res_it->IsAtFinalElement(RIL_WORD, RIL_SYMBOL)) + break; + } + alto_str << "\n\t\t\t\t\t\t\t"; + wcnt++; + if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) + break; + } + alto_str << "\n\t\t\t\t\t\t\n"; + lcnt++; + if (res_it->IsAtFinalElement(RIL_PARA, RIL_TEXTLINE)) + break; + } } - alto_str << "\">"; - ChoiceIterator choice_it(*res_it); - do { - int vc = choice_it.Confidence(); - alto_str << "\n\t\t\t\t\t\t\t\t\t"; - } while (choice_it.Next()); - alto_str << "\n\t\t\t\t\t\t\t\t"; - res_it->Next(RIL_SYMBOL); - - scnt++; - } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); - - alto_str << "\n\t\t\t\t\t\t\t"; - - wcnt++; - - if (last_word_in_line) { - alto_str << "\n\t\t\t\t\t\t\n"; - lcnt++; - } else { - int hpos = right; - int vpos = top; - res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); - int width = left - hpos; - int height = bottom - top; - alto_str << "\n\t\t\t\t\t\t\t\n"; - } - - if (last_word_in_tblock) { - alto_str << "\t\t\t\t\t\n"; + alto_str << "\t\t\t\t\t\n"; tcnt++; + if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_PARA)) + break; } - - if (last_word_in_cblock) { - alto_str << "\t\t\t\t\n"; - bcnt++; - } + alto_str << "\t\t\t\t\n"; + bcnt++; } alto_str << "\t\t\t\n"