Merge pull request #22 from ArneBinder/tokenization_dont_show_missed_partitions

tokenization: don't regard missed partitions
ArneBinder · Dec 22, 2023 · 8522938 · 8522938
2 parents e85a98b + 1700c21
commit 8522938
Showing 1 changed file with 8 additions and 5 deletions.
diff --git a/src/pie_modules/document/processing/tokenization.py b/src/pie_modules/document/processing/tokenization.py
@@ -343,11 +343,14 @@ def tokenize_document(
     missed_annotations = defaultdict(set)
     if strict_span_conversion or verbose:
         for annotation_field in doc.annotation_fields():
-            current_missed_annotations = set(doc[annotation_field.name]) - set(
-                added_annotations[annotation_field.name]
-            )
-            if len(current_missed_annotations) > 0:
-                missed_annotations[annotation_field.name] = current_missed_annotations
+            # Skip the partition layer: partitions are not required later on, and their entries
+            # are quite likely removed when windowing is applied, so reporting them only pollutes the logs.
+            if annotation_field.name != partition_layer:
+                current_missed_annotations = set(doc[annotation_field.name]) - set(
+                    added_annotations[annotation_field.name]
+                )
+                if len(current_missed_annotations) > 0:
+                    missed_annotations[annotation_field.name] = current_missed_annotations
 
     if len(missed_annotations) > 0:
         missed_annotations_simplified = {k: str(v) for k, v in missed_annotations.items()}