diff --git a/dataset_builders/pie/abstrct/README.md b/dataset_builders/pie/abstrct/README.md index d0b752a4..45123b8a 100644 --- a/dataset_builders/pie/abstrct/README.md +++ b/dataset_builders/pie/abstrct/README.md @@ -46,9 +46,9 @@ assert isinstance(doc, builders.brat.BratDocumentWithMergedSpans) The dataset provides document converters for the following target document types: - `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations` - - `LabeledSpans`, converted from `BratDocument`'s `spans` + - `LabeledSpans`, converted from `BratDocumentWithMergedSpans`'s `spans` - labels: `MajorClaim`, `Claim`, `Premise` - - `BinraryRelations`, converted from `BratDocument`'s `relations` + - `BinaryRelations`, converted from `BratDocumentWithMergedSpans`'s `relations` - labels: `Support`, `Partial-Attack`, `Attack` See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type diff --git a/dataset_builders/pie/abstrct/abstrct.py b/dataset_builders/pie/abstrct/abstrct.py index 6dc12e42..045c2e23 100644 --- a/dataset_builders/pie/abstrct/abstrct.py +++ b/dataset_builders/pie/abstrct/abstrct.py @@ -27,7 +27,7 @@ class AbstRCT(BratBuilder): # we need to add None to the list of dataset variants to support the default dataset variant BASE_BUILDER_KWARGS_DICT = { dataset_variant: {"url": URL, "split_paths": SPLIT_PATHS} - for dataset_variant in ["default", "merge_fragmented_spans", None] + for dataset_variant in ["default", None] } DOCUMENT_CONVERTERS = { diff --git a/tests/dataset_builders/pie/test_abstrct.py b/tests/dataset_builders/pie/test_abstrct.py index 02036b8f..54b99bce 100644 --- a/tests/dataset_builders/pie/test_abstrct.py +++ b/tests/dataset_builders/pie/test_abstrct.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List import pytest from datasets import disable_caching @@ -71,12 +71,9 @@ def test_document(document, dataset_variant): assert document is not None assert document.id 
== "10561201" - # check the annotation - if dataset_variant == "default" or dataset_variant is None: - span_texts_labels_tuples = [(str(span), span.label) for span in document.spans] - - # check spans + # check the spans assert len(document.spans) == 7 + span_texts_labels_tuples = [(str(span), span.label) for span in document.spans] assert span_texts_labels_tuples[0] == ( "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " "in men with metastatic, hormone-resistant, prostate cancer.", @@ -138,7 +135,7 @@ def test_document(document, dataset_variant): @pytest.fixture(scope="module") def dataset_of_text_documents_with_labeled_spans_and_binary_relations( dataset, dataset_variant -) -> Optional[DatasetDict]: +) -> DatasetDict: if dataset_variant == "default" or dataset_variant is None: converted_dataset = dataset.to_document_type( TextDocumentWithLabeledSpansAndBinaryRelations @@ -151,98 +148,97 @@ def dataset_of_text_documents_with_labeled_spans_and_binary_relations( def test_dataset_of_text_documents_with_labeled_spans_and_binary_relations( dataset_of_text_documents_with_labeled_spans_and_binary_relations, ): - if dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None: - # get a document to check - converted_doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] - # check that the conversion is correct and the data makes sense - assert isinstance(converted_doc, TextDocumentWithLabeledSpansAndBinaryRelations) - - # check the entities - assert len(converted_doc.labeled_spans) == 7 - entity_tuples = [(str(ent), ent.label) for ent in converted_doc.labeled_spans] - assert entity_tuples[0] == ( - "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in men " - "with metastatic, hormone-resistant, prostate cancer.", - "MajorClaim", - ) - assert entity_tuples[1] == ( - "At 6 weeks, both groups showed improvement in several HQL 
domains,", - "Premise", - ) - assert entity_tuples[2] == ( - "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " - "prednisone-alone group.", - "Premise", - ) - assert entity_tuples[3] == ( - "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " - "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), " - "four functioning domains, and nine symptoms (.001 < P <. 01),", - "Premise", - ) - assert entity_tuples[4] == ( - "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " - "(.004 < P <.05).", - "Premise", - ) - assert entity_tuples[5] == ( - "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " - "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", - "Premise", - ) - assert entity_tuples[6] == ( - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - "Claim", - ) + # get a document to check + converted_doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] + # check that the conversion is correct and the data makes sense + assert isinstance(converted_doc, TextDocumentWithLabeledSpansAndBinaryRelations) + + # check the entities + assert len(converted_doc.labeled_spans) == 7 + entity_tuples = [(str(ent), ent.label) for ent in converted_doc.labeled_spans] + assert entity_tuples[0] == ( + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in men " + "with metastatic, hormone-resistant, prostate cancer.", + "MajorClaim", + ) + assert entity_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Premise", + ) + assert entity_tuples[2] == ( + 
"only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Premise", + ) + assert entity_tuples[3] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), " + "four functioning domains, and nine symptoms (.001 < P <. 01),", + "Premise", + ) + assert entity_tuples[4] == ( + "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Premise", + ) + assert entity_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Premise", + ) + assert entity_tuples[6] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Claim", + ) - # check the relations - assert len(converted_doc.binary_relations) == 6 - relation_tuples = [ - (str(rel.head), rel.label, str(rel.tail)) for rel in converted_doc.binary_relations - ] - assert relation_tuples[0] == ( - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - "Support", - "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " - "in men with metastatic, hormone-resistant, prostate cancer.", - ) - assert relation_tuples[1] == ( - "At 6 weeks, both groups showed improvement in several HQL domains,", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms 
than treatment with prednisone alone.", - ) - assert relation_tuples[2] == ( - "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " - "prednisone-alone group.", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[3] == ( - "the improvement (> 10 units on a scale of 0 to100) lasted longer than in the prednisone-alone group " - "(.004 < P <.05).", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement in " - "several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[4] == ( - "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " - "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " - "functioning domains, and nine symptoms (.001 < P <. 
01),", - "Support", - "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " - "in several HQL domains and symptoms than treatment with prednisone alone.", - ) - assert relation_tuples[5] == ( - "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with improvements " - "in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", - "Support", - "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in " - "men with metastatic, hormone-resistant, prostate cancer.", - ) + # check the relations + assert len(converted_doc.binary_relations) == 6 + relation_tuples = [ + (str(rel.head), rel.label, str(rel.tail)) for rel in converted_doc.binary_relations + ] + assert relation_tuples[0] == ( + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + "Support", + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain " + "in men with metastatic, hormone-resistant, prostate cancer.", + ) + assert relation_tuples[1] == ( + "At 6 weeks, both groups showed improvement in several HQL domains,", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[2] == ( + "only physical functioning and pain were better in the mitoxantrone-plus-prednisone group than in the " + "prednisone-alone group.", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[3] == ( + "the improvement (> 10 units on a scale of 0 
to100) lasted longer than in the prednisone-alone group " + "(.004 < P <.05).", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement in " + "several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[4] == ( + "After 6 weeks, patients taking prednisone showed no improvement in HQL scores, whereas those taking " + "mitoxantrone plus prednisone showed significant improvements in global quality of life (P =.009), four " + "functioning domains, and nine symptoms (.001 < P <. 01),", + "Support", + "Treatment with mitoxantrone plus prednisone was associated with greater and longer-lasting improvement " + "in several HQL domains and symptoms than treatment with prednisone alone.", + ) + assert relation_tuples[5] == ( + "The addition of mitoxantrone to prednisone after failure of prednisone alone was associated with " + "improvements in pain, pain impact, pain relief, insomnia, and global quality of life (.001 < P <.003).", + "Support", + "A combination of mitoxantrone plus prednisone is preferable to prednisone alone for reduction of pain in " + "men with metastatic, hormone-resistant, prostate cancer.", + ) @pytest.fixture(scope="module") @@ -253,10 +249,7 @@ def tokenizer() -> PreTrainedTokenizer: @pytest.fixture(scope="module") def tokenized_documents_with_labeled_spans_and_binary_relations( dataset_of_text_documents_with_labeled_spans_and_binary_relations, tokenizer -) -> Optional[List[TestTokenDocumentWithLabeledSpansAndBinaryRelations]]: - if dataset_of_text_documents_with_labeled_spans_and_binary_relations is None: - return None - +) -> List[TestTokenDocumentWithLabeledSpansAndBinaryRelations]: # get a document to check doc = dataset_of_text_documents_with_labeled_spans_and_binary_relations[SPLIT][0] # Note, that this is a list of documents, because the document may be split into chunks @@ -275,96 +268,94 @@ def 
tokenized_documents_with_labeled_spans_and_binary_relations( def test_tokenized_documents_with_labeled_spans_and_binary_relations( tokenized_documents_with_labeled_spans_and_binary_relations, ): - if tokenized_documents_with_labeled_spans_and_binary_relations is not None: - docs = tokenized_documents_with_labeled_spans_and_binary_relations - # check that the tokenization was fine - assert len(docs) == 1 - doc = docs[0] - assert len(doc.tokens) == 465 - assert len(doc.labeled_spans) == 7 - ent = doc.labeled_spans[0] - assert ( - str(ent) - == "('a', 'combination', 'of', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'is', " - "'prefer', '##able', 'to', 'pre', '##d', '##nis', '##one', 'alone', 'for', 'reduction', 'of', 'pain', 'in', " - "'men', 'with', 'meta', '##static', ',', 'hormone', '-', 'resistant', ',', 'prostate', 'cancer', '.')" - ) - ent = doc.labeled_spans[1] - assert ( - str(ent) - == "('at', '6', 'weeks', ',', 'both', 'groups', 'showed', 'improvement', 'in', 'several', 'hq', '##l', " - "'domains', ',')" - ) - ent = doc.labeled_spans[2] - assert ( - str(ent) - == "('only', 'physical', 'functioning', 'and', 'pain', 'were', 'better', 'in', 'the', 'mit', '##ox', '##ant', " - "'##rone', '-', 'plus', '-', 'pre', '##d', '##nis', '##one', 'group', 'than', 'in', 'the', 'pre', '##d', " - "'##nis', '##one', '-', 'alone', 'group', '.')" - ) - ent = doc.labeled_spans[3] - assert ( - str(ent) - == "('after', '6', 'weeks', ',', 'patients', 'taking', 'pre', '##d', '##nis', '##one', 'showed', 'no', " - "'improvement', 'in', 'hq', '##l', 'scores', ',', 'whereas', 'those', 'taking', 'mit', '##ox', '##ant', " - "'##rone', 'plus', 'pre', '##d', '##nis', '##one', 'showed', 'significant', 'improvements', 'in', 'global', " - "'quality', 'of', 'life', '(', 'p', '=', '.', '00', '##9', ')', ',', 'four', 'functioning', 'domains', ',', " - "'and', 'nine', 'symptoms', '(', '.', '001', '<', 'p', '<', '.', '01', ')', ',')" - ) - ent = doc.labeled_spans[4] - 
assert ( - str(ent) - == "('the', 'improvement', '(', '>', '10', 'units', 'on', 'a', 'scale', 'of', '0', 'to', '##100', ')', " - "'lasted', 'longer', 'than', 'in', 'the', 'pre', '##d', '##nis', '##one', '-', 'alone', 'group', '(', '.', " - "'00', '##4', '<', 'p', '<', '.', '05', ')', '.')" - ) - ent = doc.labeled_spans[5] - assert ( - str(ent) - == "('the', 'addition', 'of', 'mit', '##ox', '##ant', '##rone', 'to', 'pre', '##d', '##nis', '##one', " - "'after', 'failure', 'of', 'pre', '##d', '##nis', '##one', 'alone', 'was', 'associated', 'with', " - "'improvements', 'in', 'pain', ',', 'pain', 'impact', ',', 'pain', 'relief', ',', 'ins', '##om', '##nia', " - "',', 'and', 'global', 'quality', 'of', 'life', '(', '.', '001', '<', 'p', '<', '.', '00', '##3', ')', '.')" - ) - ent = doc.labeled_spans[6] - assert ( - str(ent) - == "('treatment', 'with', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'was', " - "'associated', 'with', 'greater', 'and', 'longer', '-', 'lasting', 'improvement', 'in', 'several', " - "'hq', '##l', 'domains', 'and', 'symptoms', 'than', 'treatment', 'with', 'pre', '##d', '##nis', '##one', " - "'alone', '.')" - ) + docs = tokenized_documents_with_labeled_spans_and_binary_relations + # check that the tokenization was fine + assert len(docs) == 1 + doc = docs[0] + assert len(doc.tokens) == 465 + assert len(doc.labeled_spans) == 7 + ent = doc.labeled_spans[0] + assert ( + str(ent) + == "('a', 'combination', 'of', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'is', " + "'prefer', '##able', 'to', 'pre', '##d', '##nis', '##one', 'alone', 'for', 'reduction', 'of', 'pain', 'in', " + "'men', 'with', 'meta', '##static', ',', 'hormone', '-', 'resistant', ',', 'prostate', 'cancer', '.')" + ) + ent = doc.labeled_spans[1] + assert ( + str(ent) + == "('at', '6', 'weeks', ',', 'both', 'groups', 'showed', 'improvement', 'in', 'several', 'hq', '##l', " + "'domains', ',')" + ) + ent = doc.labeled_spans[2] + assert 
( + str(ent) + == "('only', 'physical', 'functioning', 'and', 'pain', 'were', 'better', 'in', 'the', 'mit', '##ox', '##ant', " + "'##rone', '-', 'plus', '-', 'pre', '##d', '##nis', '##one', 'group', 'than', 'in', 'the', 'pre', '##d', " + "'##nis', '##one', '-', 'alone', 'group', '.')" + ) + ent = doc.labeled_spans[3] + assert ( + str(ent) + == "('after', '6', 'weeks', ',', 'patients', 'taking', 'pre', '##d', '##nis', '##one', 'showed', 'no', " + "'improvement', 'in', 'hq', '##l', 'scores', ',', 'whereas', 'those', 'taking', 'mit', '##ox', '##ant', " + "'##rone', 'plus', 'pre', '##d', '##nis', '##one', 'showed', 'significant', 'improvements', 'in', 'global', " + "'quality', 'of', 'life', '(', 'p', '=', '.', '00', '##9', ')', ',', 'four', 'functioning', 'domains', ',', " + "'and', 'nine', 'symptoms', '(', '.', '001', '<', 'p', '<', '.', '01', ')', ',')" + ) + ent = doc.labeled_spans[4] + assert ( + str(ent) + == "('the', 'improvement', '(', '>', '10', 'units', 'on', 'a', 'scale', 'of', '0', 'to', '##100', ')', " + "'lasted', 'longer', 'than', 'in', 'the', 'pre', '##d', '##nis', '##one', '-', 'alone', 'group', '(', '.', " + "'00', '##4', '<', 'p', '<', '.', '05', ')', '.')" + ) + ent = doc.labeled_spans[5] + assert ( + str(ent) + == "('the', 'addition', 'of', 'mit', '##ox', '##ant', '##rone', 'to', 'pre', '##d', '##nis', '##one', " + "'after', 'failure', 'of', 'pre', '##d', '##nis', '##one', 'alone', 'was', 'associated', 'with', " + "'improvements', 'in', 'pain', ',', 'pain', 'impact', ',', 'pain', 'relief', ',', 'ins', '##om', '##nia', " + "',', 'and', 'global', 'quality', 'of', 'life', '(', '.', '001', '<', 'p', '<', '.', '00', '##3', ')', '.')" + ) + ent = doc.labeled_spans[6] + assert ( + str(ent) + == "('treatment', 'with', 'mit', '##ox', '##ant', '##rone', 'plus', 'pre', '##d', '##nis', '##one', 'was', " + "'associated', 'with', 'greater', 'and', 'longer', '-', 'lasting', 'improvement', 'in', 'several', " + "'hq', '##l', 'domains', 'and', 'symptoms', 'than', 
'treatment', 'with', 'pre', '##d', '##nis', '##one', " + "'alone', '.')" + ) def test_tokenized_documents_with_entities_and_relations_all( dataset_of_text_documents_with_labeled_spans_and_binary_relations, tokenizer, dataset_variant ): - if dataset_of_text_documents_with_labeled_spans_and_binary_relations is not None: - for ( - split, - docs, - ) in dataset_of_text_documents_with_labeled_spans_and_binary_relations.items(): - for doc in docs: - # Note, that this is a list of documents, because the document may be split into chunks - # if the input text is too long. - tokenized_docs = tokenize_document( - doc, - tokenizer=tokenizer, - return_overflowing_tokens=True, - result_document_type=TestTokenDocumentWithLabeledSpansAndBinaryRelations, - strict_span_conversion=True, - verbose=True, - ) - # we just ensure that we get at least one tokenized document - assert tokenized_docs is not None - assert len(tokenized_docs) > 0 + for ( + split, + docs, + ) in dataset_of_text_documents_with_labeled_spans_and_binary_relations.items(): + for doc in docs: + # Note, that this is a list of documents, because the document may be split into chunks + # if the input text is too long. + tokenized_docs = tokenize_document( + doc, + tokenizer=tokenizer, + return_overflowing_tokens=True, + result_document_type=TestTokenDocumentWithLabeledSpansAndBinaryRelations, + strict_span_conversion=True, + verbose=True, + ) + # we just ensure that we get at least one tokenized document + assert tokenized_docs is not None + assert len(tokenized_docs) > 0 def test_document_converters(dataset_variant): builder = BUILDER_CLASS(config_name=dataset_variant) document_converters = builder.document_converters - if dataset_variant == "default": + if dataset_variant == "default" or dataset_variant is None: assert len(document_converters) == 1 assert set(document_converters) == { TextDocumentWithLabeledSpansAndBinaryRelations,