From 1700c21e6bd2a4766ee8b5554c66a82407fe9226 Mon Sep 17 00:00:00 2001
From: Arne Binder
Date: Fri, 22 Dec 2023 15:16:57 +0100
Subject: [PATCH] tokenization: don't regard missed partitions (don't show them
 if verbose and don't break if strict_span_conversion)

---
 src/pie_modules/document/processing/tokenization.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/pie_modules/document/processing/tokenization.py b/src/pie_modules/document/processing/tokenization.py
index 099d52582..172b11b52 100644
--- a/src/pie_modules/document/processing/tokenization.py
+++ b/src/pie_modules/document/processing/tokenization.py
@@ -343,11 +343,14 @@ def tokenize_document(
     missed_annotations = defaultdict(set)
     if strict_span_conversion or verbose:
         for annotation_field in doc.annotation_fields():
-            current_missed_annotations = set(doc[annotation_field.name]) - set(
-                added_annotations[annotation_field.name]
-            )
-            if len(current_missed_annotations) > 0:
-                missed_annotations[annotation_field.name] = current_missed_annotations
+            # do not check the partition layer because the partitions are not required later on
+            # and their entries are quite likely removed when windowing is applied, so this just pollutes the logs
+            if annotation_field.name != partition_layer:
+                current_missed_annotations = set(doc[annotation_field.name]) - set(
+                    added_annotations[annotation_field.name]
+                )
+                if len(current_missed_annotations) > 0:
+                    missed_annotations[annotation_field.name] = current_missed_annotations
 
     if len(missed_annotations) > 0:
         missed_annotations_simplified = {k: str(v) for k, v in missed_annotations.items()}
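
For context, the following is a minimal, self-contained sketch of the effect of the new check: once the partition layer is excluded, partitions dropped by windowing no longer show up as missed annotations (so strict_span_conversion does not fail and verbose mode does not log them). The layer names, the string placeholders for annotations, and the plain-dict doc_annotations/added_annotations structures are simplified stand-ins for illustration, not the pie_modules API.

from collections import defaultdict

# Stand-in data: per-layer annotation sets for the original document and for
# the annotations that were successfully carried over to the tokenized result.
doc_annotations = {
    "labeled_partitions": {"partition(0, 100)", "partition(100, 200)"},
    "entities": {"ent(5, 12)", "ent(110, 118)"},
}
added_annotations = {
    # the second partition is lost because windowing split the document
    "labeled_partitions": {"partition(0, 100)"},
    "entities": {"ent(5, 12)", "ent(110, 118)"},
}
partition_layer = "labeled_partitions"

missed_annotations = defaultdict(set)
for layer_name, annotations in doc_annotations.items():
    # skip the partition layer: its entries are expected to be dropped by
    # windowing and are not required later on
    if layer_name == partition_layer:
        continue
    missed = annotations - added_annotations.get(layer_name, set())
    if missed:
        missed_annotations[layer_name] = missed

print(dict(missed_annotations))  # {} -> no spurious "missed partition" report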