Merge pull request #83 from INL/feat/batch-processing

Add batch processing and report summaries
qurator-spk · May 26, 2023 · 35be58c · 35be58c
2 parents 6d3a8ce + 207804e
commit 35be58c
Show file tree

Hide file tree

Showing 17 changed files with 17,584 additions and 26 deletions.
diff --git a/README.md b/README.md
@@ -5,7 +5,8 @@ dinglehopper is an OCR evaluation tool and reads
 [ALTO](https://github.com/altoxml),
 [PAGE](https://github.com/PRImA-Research-Lab/PAGE-XML) and text files.  It
 compares a ground truth (GT) document page with an OCR result page to compute
-metrics and a word/character differences report.
+metrics and a word/character differences report. It also supports batch processing by 
+generating, aggregating and summarizing multiple reports.
 
 [![Build Status](https://circleci.com/gh/qurator-spk/dinglehopper.svg?style=svg)](https://circleci.com/gh/qurator-spk/dinglehopper)
 
@@ -27,27 +28,31 @@ sudo pip install .
 Usage
 -----
 ~~~
-Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
+Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] [REPORTS_FOLDER]
 
   Compare the PAGE/ALTO/text document GT against the document OCR.
 
   dinglehopper detects if GT/OCR are ALTO or PAGE XML documents to extract
   their text and falls back to plain text if no ALTO or PAGE is detected.
 
   The files GT and OCR are usually a ground truth document and the result of
-  an OCR software, but you may use dinglehopper to compare two OCR results.
-  In that case, use --no-metrics to disable the then meaningless metrics and
-  also change the color scheme from green/red to blue.
+  an OCR software, but you may use dinglehopper to compare two OCR results. In
+  that case, use --no-metrics to disable the then meaningless metrics and also
+  change the color scheme from green/red to blue.
 
-  The comparison report will be written to $REPORT_PREFIX.{html,json}, where
-  $REPORT_PREFIX defaults to "report". The reports include the character
-  error rate (CER) and the word error rate (WER).
+  The comparison report will be written to
+  $REPORTS_FOLDER/$REPORT_PREFIX.{html,json}, where $REPORTS_FOLDER defaults
+  to the current working directory and $REPORT_PREFIX defaults to "report".
+  The reports include the character error rate (CER) and the word error rate
+  (WER).
 
   By default, the text of PAGE files is extracted on 'region' level. You may
   use "--textequiv-level line" to extract from the level of TextLine tags.
 
 Options:
   --metrics / --no-metrics  Enable/disable metrics and green/red
+  --differences BOOLEAN     Enable reporting character and word level
+                            differences
   --textequiv-level LEVEL   PAGE TextEquiv level to extract text from
   --progress                Show progress bar
   --help                    Show this message and exit.
@@ -61,6 +66,43 @@ This generates `report.html` and `report.json`.
 
 ![dinglehopper displaying metrics and character differences](.screenshots/dinglehopper.png?raw=true)
 
+Batch comparison between folders of GT and OCR files can be done by simply providing 
+folders:
+~~~
+dinglehopper gt/ ocr/ report output_folder/
+~~~
+This assumes that you have files with exactly the same name in both folders, e.g. 
+`gt/00000001.page.xml` and `ocr/00000001.page.xml`. Files without a counterpart are skipped.
+
+The example generates one pair of reports per matching file, named 
+`<file name>-report.{html,json}`, in the (automatically created) folder `output_folder/`.
+
+By default, the JSON report does not contain the character and word differences, only 
+the calculated metrics. If you want to include the differences, pass a boolean value 
+to the `--differences` option:
+
+~~~
+dinglehopper gt/ ocr/ report output_folder/ --differences true
+~~~
+
+### dinglehopper-summarize
+A set of (JSON) reports can be summarized into a single set of 
+reports. This is useful after having generated reports in batch.
+Example:
+~~~
+dinglehopper-summarize output_folder/
+~~~
+This generates `summary.html` and `summary.json` in the same `output_folder`.
+
+If you are summarizing many reports and have used the `--differences` option while
+generating them, it may be useful to limit the number of differences reported by using
+the `--occurrences-threshold` parameter. This will reduce the size of the generated HTML 
+report, making it easier to open and navigate. Note that the JSON report will still
+contain all differences. Example:
+~~~
+dinglehopper-summarize output_folder/ --occurrences-threshold 10
+~~~
+
 ### dinglehopper-line-dirs
 You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt.txt`)
 with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate

diff --git a/dinglehopper/cli.py b/dinglehopper/cli.py
@@ -1,20 +1,21 @@
 import os
+from collections import Counter
 
 import click
 from jinja2 import Environment, FileSystemLoader
 from markupsafe import escape
 from uniseg.graphemecluster import grapheme_clusters
 from ocrd_utils import initLogging
 
-from .character_error_rate import character_error_rate_n
-from .word_error_rate import word_error_rate_n, words_normalized
-from .align import seq_align
-from .extracted_text import ExtractedText
-from .ocr_files import extract
-from .config import Config
+from dinglehopper.character_error_rate import character_error_rate_n
+from dinglehopper.word_error_rate import word_error_rate_n, words_normalized
+from dinglehopper.align import seq_align
+from dinglehopper.extracted_text import ExtractedText
+from dinglehopper.ocr_files import extract
+from dinglehopper.config import Config
 
 
-def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
+def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, differences=False):
     gtx = ""
     ocrx = ""
 
@@ -54,6 +55,8 @@ def format_thing(t, css_classes=None, id_=None):
 
     g_pos = 0
     o_pos = 0
+    found_differences = []
+
     for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
         css_classes = None
         gt_id = None
@@ -66,6 +69,9 @@ def format_thing(t, css_classes=None, id_=None):
                 # Deletions and inserts only produce one id + None, UI must
                 # support this, i.e. display for the one id produced
 
+            if differences:
+                found_differences.append(f'{g} :: {o}')
+
         gtx += joiner + format_thing(g, css_classes, gt_id)
         ocrx += joiner + format_thing(o, css_classes, ocr_id)
 
@@ -74,14 +80,16 @@ def format_thing(t, css_classes=None, id_=None):
         if o is not None:
             o_pos += len(o)
 
+    found_differences = dict(Counter(elem for elem in found_differences))
+
     return """
         <div class="row">
            <div class="col-md-6 gt">{}</div>
            <div class="col-md-6 ocr">{}</div>
         </div>
         """.format(
         gtx, ocrx
-    )
+    ), found_differences
 
 
 def json_float(value):
@@ -97,7 +105,8 @@ def json_float(value):
         return str(value)
 
 
-def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
+def process(gt, ocr, report_prefix, reports_folder='.', *, metrics=True,
+            differences=False, textequiv_level="region"):
     """Check OCR result against GT.
 
     The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
@@ -110,14 +119,15 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
     cer, n_characters = character_error_rate_n(gt_text, ocr_text)
     wer, n_words = word_error_rate_n(gt_text, ocr_text)
 
-    char_diff_report = gen_diff_report(
-        gt_text, ocr_text, css_prefix="c", joiner="", none="·"
-    )
+    char_diff_report, diff_c = gen_diff_report(gt_text, ocr_text, css_prefix="c",
+                                               joiner="",
+                                               none="·", differences=differences)
 
     gt_words = words_normalized(gt_text)
     ocr_words = words_normalized(ocr_text)
-    word_diff_report = gen_diff_report(
-        gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯"
+    word_diff_report, diff_w = gen_diff_report(
+        gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯",
+        differences=differences
     )
 
     env = Environment(
@@ -129,7 +139,11 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
 
     for report_suffix in (".html", ".json"):
         template_fn = "report" + report_suffix + ".j2"
-        out_fn = report_prefix + report_suffix
+
+        if not os.path.isdir(reports_folder):
+            os.mkdir(reports_folder)
+
+        out_fn = os.path.join(reports_folder, report_prefix + report_suffix)
 
         template = env.get_template(template_fn)
         template.stream(
@@ -142,24 +156,51 @@ def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
             char_diff_report=char_diff_report,
             word_diff_report=word_diff_report,
             metrics=metrics,
+            differences=differences,
+            diff_c=diff_c,
+            diff_w=diff_w,
         ).dump(out_fn)
 
 
+def process_dir(gt, ocr, report_prefix, reports_folder, metrics, differences,
+                textequiv_level):
+    """Compare every file in the folder gt with its namesake in the folder ocr.
+
+    For each entry of gt, process() is called if both gt/<name> and ocr/<name>
+    are regular files; the report prefix becomes "<name>-<report_prefix>".
+    Entries without such a pair are reported on stdout and skipped.
+    """
+    for file_name in os.listdir(gt):
+        gt_path = os.path.join(gt, file_name)
+        ocr_path = os.path.join(ocr, file_name)
+
+        # Both sides have to be regular files, otherwise there is nothing to compare.
+        if not (os.path.isfile(gt_path) and os.path.isfile(ocr_path)):
+            print("Skipping {0} and {1}".format(gt_path, ocr_path))
+            continue
+
+        process(
+            gt_path,
+            ocr_path,
+            f"{file_name}-{report_prefix}",
+            reports_folder=reports_folder,
+            metrics=metrics,
+            differences=differences,
+            textequiv_level=textequiv_level,
+        )
+
+
 @click.command()
 @click.argument("gt", type=click.Path(exists=True))
 @click.argument("ocr", type=click.Path(exists=True))
 @click.argument("report_prefix", type=click.Path(), default="report")
+@click.argument("reports_folder", type=click.Path(), default=".")
 @click.option(
     "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
 )
+@click.option(
+    "--differences",
+    default=False,
+    help="Enable reporting character and word level differences"
+)
 @click.option(
     "--textequiv-level",
     default="region",
     help="PAGE TextEquiv level to extract text from",
     metavar="LEVEL",
 )
 @click.option("--progress", default=False, is_flag=True, help="Show progress bar")
-def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
+def main(gt, ocr, report_prefix, reports_folder, metrics, differences, textequiv_level,
+         progress):
     """
     Compare the PAGE/ALTO/text document GT against the document OCR.
 
@@ -171,7 +212,8 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
     that case, use --no-metrics to disable the then meaningless metrics and also
     change the color scheme from green/red to blue.
 
-    The comparison report will be written to $REPORT_PREFIX.{html,json}, where
+    The comparison report will be written to $REPORTS_FOLDER/$REPORT_PREFIX.{html,json},
+    where $REPORTS_FOLDER defaults to the current working directory and
     $REPORT_PREFIX defaults to "report". The reports include the character error
     rate (CER) and the word error rate (WER).
 
@@ -180,7 +222,17 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
     """
     initLogging()
     Config.progress = progress
-    process(gt, ocr, report_prefix, metrics=metrics, textequiv_level=textequiv_level)
+    if os.path.isdir(gt):
+        if not os.path.isdir(ocr):
+            raise click.BadParameter(
+                "OCR must be a directory if GT is a directory", param_hint="ocr"
+            )
+        else:
+            process_dir(gt, ocr, report_prefix, reports_folder, metrics,
+                        differences, textequiv_level)
+    else:
+        process(gt, ocr, report_prefix, reports_folder, metrics=metrics,
+                differences=differences, textequiv_level=textequiv_level)
 
 
 if __name__ == "__main__":

diff --git a/dinglehopper/cli_summarize.py b/dinglehopper/cli_summarize.py
@@ -0,0 +1,101 @@
+import json
+import os
+
+import click
+from ocrd_utils import initLogging
+from jinja2 import Environment, FileSystemLoader
+
+from dinglehopper.cli import json_float
+
+
+def process(reports_folder, occurrences_threshold=1):
+    """Summarize all JSON reports found in a folder.
+
+    Reads every ``*.json`` file in ``reports_folder``, averages the CER and WER
+    values and adds up the character and word level difference counts. The
+    result is written to ``summary.html`` and ``summary.json`` in that folder.
+
+    Reports without "cer"/"wer" are skipped. Reports without a "differences"
+    section still count towards the averages, they just do not contribute any
+    differences.
+
+    :param reports_folder: folder containing the JSON reports
+    :param occurrences_threshold: passed on to the summary templates
+    """
+    cer_list = []
+    wer_list = []
+    diff_c = {}
+    diff_w = {}
+
+    # Sorted, so that the summary does not depend on the directory order.
+    for report in sorted(os.listdir(reports_folder)):
+        if not report.endswith(".json"):
+            continue
+
+        # Read as UTF-8 instead of the locale encoding: the difference keys contain
+        # non-ASCII characters (NOTE: assumes the reports were dumped by Jinja2 with
+        # its default encoding, UTF-8).
+        report_path = os.path.join(reports_folder, report)
+        with open(report_path, "r", encoding="utf-8") as f:
+            report_data = json.load(f)
+
+        if "cer" not in report_data or "wer" not in report_data:
+            click.echo(
+                f"Skipping {report} because it does not contain CER and WER")
+            continue
+
+        cer_list.append(report_data["cer"])
+        wer_list.append(report_data["wer"])
+
+        # Do not fail on reports that were generated without differences.
+        report_differences = report_data.get("differences") or {}
+        for key, value in report_differences.get("character_level", {}).items():
+            diff_c[key] = diff_c.get(key, 0) + value
+        for key, value in report_differences.get("word_level", {}).items():
+            diff_w[key] = diff_w.get(key, 0) + value
+
+    if not cer_list:
+        click.echo(f"No reports found in folder '{os.path.abspath(reports_folder)}'")
+        return
+
+    cer_sum = sum(cer_list)
+    wer_sum = sum(wer_list)
+    cer_avg = cer_sum / len(cer_list)
+    wer_avg = wer_sum / len(wer_list)
+
+    print(f"Number of reports: {len(cer_list)}")
+    print(f"Average CER: {cer_avg}")
+    print(f"Average WER: {wer_avg}")
+    # These two lines print the summed error rates, so label them as such.
+    print(f"Sum of CER: {cer_sum}")
+    print(f"Sum of WER: {wer_sum}")
+
+    # The templates live in the "templates" folder next to this module.
+    templates_dir = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)), "templates"
+    )
+    env = Environment(loader=FileSystemLoader(templates_dir))
+    env.filters["json_float"] = json_float
+    for report_suffix in (".html", ".json"):
+        template = env.get_template("summary" + report_suffix + ".j2")
+        out_fn = os.path.join(reports_folder, "summary" + report_suffix)
+        template.stream(
+            num_reports=len(cer_list),
+            cer_avg=cer_avg,
+            wer_avg=wer_avg,
+            diff_c=diff_c,
+            diff_w=diff_w,
+            occurrences_threshold=occurrences_threshold,
+        ).dump(out_fn)
+
+
+@click.command()
+@click.argument(
+    "reports_folder",
+    # process() calls os.listdir() on this path, so reject plain files up front
+    # with a proper usage error instead of a traceback.
+    type=click.Path(exists=True, file_okay=False),
+    default="./reports",
+)
+@click.option(
+    "--occurrences-threshold",
+    type=int,
+    default=1,
+    help="Only show differences that occur at least this many times.",
+)
+def main(reports_folder, occurrences_threshold):
+    """
+    Summarize the results from multiple reports generated earlier by dinglehopper.
+    It calculates the average CER and WER, as well as a sum of common mistakes.
+    Reports include lists of mistakes and their occurrences.
+
+    You may use a threshold to reduce the file size of the HTML report by only showing
+    mistakes whose number of occurrences is above the threshold. The JSON report will
+    always contain all mistakes.
+
+    All JSON files in the provided folder will be gathered and summarized.
+    """
+    # NOTE: the docstring above is shown by click as the --help text.
+    initLogging()
+    process(reports_folder, occurrences_threshold)
+
+
+if __name__ == "__main__":
+    main()