Skip to content

Commit

Permalink
Merge pull request #22 from ArneBinder/tokenization_dont_show_missed_…
Browse files Browse the repository at this point in the history
…partitions

tokenization: don't regard missed partitions
  • Loading branch information
ArneBinder authored Dec 22, 2023
2 parents e85a98b + 1700c21 commit 8522938
Showing 1 changed file with 8 additions and 5 deletions.
13 changes: 8 additions & 5 deletions src/pie_modules/document/processing/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,11 +343,14 @@ def tokenize_document(
missed_annotations = defaultdict(set)
if strict_span_conversion or verbose:
for annotation_field in doc.annotation_fields():
current_missed_annotations = set(doc[annotation_field.name]) - set(
added_annotations[annotation_field.name]
)
if len(current_missed_annotations) > 0:
missed_annotations[annotation_field.name] = current_missed_annotations
# do not check the partition layer: the partitions are not required later on, and their
# entries are quite likely to be removed when windowing is applied, so reporting them would just pollute the logs
if annotation_field.name != partition_layer:
current_missed_annotations = set(doc[annotation_field.name]) - set(
added_annotations[annotation_field.name]
)
if len(current_missed_annotations) > 0:
missed_annotations[annotation_field.name] = current_missed_annotations

if len(missed_annotations) > 0:
missed_annotations_simplified = {k: str(v) for k, v in missed_annotations.items()}
Expand Down

0 comments on commit 8522938

Please sign in to comment.