Merge pull request #106 from ArneBinder/drugprot

add DrugProt dataset
ArneBinder · Apr 5, 2024 · c549768 · c549768
2 parents 78da1b9 + f2f2b38
commit c549768
Show file tree

Hide file tree

Showing 7 changed files with 809 additions and 5 deletions.
diff --git a/.gitignore b/.gitignore
@@ -117,6 +117,9 @@ venv.bak/
 # Rope project settings
 .ropeproject
 
+# Vscode project settings
+.vscode
+
 # mkdocs documentation
 /site
 

diff --git a/dataset_builders/pie/drugprot/README.md b/dataset_builders/pie/drugprot/README.md
@@ -0,0 +1,50 @@
+# PIE Dataset Card for "DrugProt"
+
+This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the
+[DrugProt Huggingface dataset loading script](https://huggingface.co/datasets/bigbio/drugprot).
+
+## Data Schema
+
+Two versions of the dataset are supported: `drugprot_source` and `drugprot_bigbio_kb`.
+
+#### `DrugprotDocument` for `drugprot_source`
+
+defines the following fields:
+
+- `text` (str)
+- `id` (str, optional)
+- `metadata` (dictionary, optional)
+- `title` (str, optional)
+- `abstract` (str, optional)
+
+and the following annotation layers:
+
+- `entities` (annotation type: `LabeledSpan`, target: `text`)
+- `relations` (annotation type: `BinaryRelation`, target: `entities`)
+
+#### `DrugprotBigbioDocument` for `drugprot_bigbio_kb`
+
+defines the following fields:
+
+- `text` (str)
+- `id` (str, optional)
+- `metadata` (dictionary, optional)
+
+and the following annotation layers:
+
+- `passages` (annotation type: `LabeledSpan`, target: `text`)
+- `entities` (annotation type: `LabeledSpan`, target: `text`)
+- `relations` (annotation type: `BinaryRelation`, target: `entities`)
+
+See [here](https://github.com/ArneBinder/pie-modules/blob/main/src/pie_modules/annotations.py) for the annotation
+type definitions.
+
+## Document Converters
+
+The dataset provides predefined document converters for the following target document types:
+
+- `pie_modules.documents.TextDocumentWithLabeledSpansAndBinaryRelations` for `DrugprotDocument`
+- `pie_modules.documents.TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions` for `DrugprotBigbioDocument`
+
+See [here](https://github.com/ArneBinder/pie-modules/blob/main/src/pie_modules/documents.py) for the document type
+definitions.
diff --git a/dataset_builders/pie/drugprot/drugprot.py b/dataset_builders/pie/drugprot/drugprot.py
@@ -0,0 +1,154 @@
+from dataclasses import dataclass
+from typing import Any, Dict, Optional, Union
+
+import datasets
+from pie_modules.annotations import BinaryRelation, LabeledSpan
+from pie_modules.documents import (
+    AnnotationLayer,
+    TextBasedDocument,
+    TextDocumentWithLabeledSpansAndBinaryRelations,
+    TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+    annotation_field,
+)
+
+from pie_datasets import GeneratorBasedBuilder
+
+
+@dataclass
+class DrugprotDocument(TextBasedDocument):
+    """Document type used for the `drugprot_source` config.
+
+    Extends `TextBasedDocument` with optional `title` and `abstract` strings and
+    two annotation layers, `entities` and `relations`.
+    """
+    title: Optional[str] = None
+    abstract: Optional[str] = None
+    # Labeled spans; this layer targets the `text` field.
+    entities: AnnotationLayer[LabeledSpan] = annotation_field(target="text")
+    # Binary relations; this layer targets the `entities` layer.
+    relations: AnnotationLayer[BinaryRelation] = annotation_field(target="entities")
+
+
+@dataclass
+class DrugprotBigbioDocument(TextBasedDocument):
+    """Document type used for the `drugprot_bigbio_kb` config.
+
+    Extends `TextBasedDocument` with three annotation layers: `passages`,
+    `entities` and `relations`.
+    """
+    # Labeled spans, one per passage of the example; this layer targets `text`.
+    passages: AnnotationLayer[LabeledSpan] = annotation_field(target="text")
+    # Labeled spans for the entities; this layer targets `text`.
+    entities: AnnotationLayer[LabeledSpan] = annotation_field(target="text")
+    # Binary relations; this layer targets the `entities` layer.
+    relations: AnnotationLayer[BinaryRelation] = annotation_field(target="entities")
+
+
+def example2drugprot(example: Dict[str, Any]) -> DrugprotDocument:
+    """Convert one `drugprot_source` example into a `DrugprotDocument`.
+
+    Args:
+        example: Dict providing the keys `text`, `title`, `abstract`,
+            `document_id`, `entities` and `relations`. Each entity provides
+            `id`, `type` and `offset` (indexed as `[0]` for start and `[1]` for
+            end); each relation provides `type`, `arg1_id` and `arg2_id`.
+
+    Returns:
+        The document with its `entities` and `relations` layers filled in the
+        order given by the example. The entity ids are collected under
+        `metadata["entity_ids"]` in the same order as the `entities` layer.
+
+    Raises:
+        KeyError: If a required key is missing, or a relation refers to an
+            entity id that does not occur in `example["entities"]`.
+    """
+    metadata = {"entity_ids": []}
+    # Maps the entity id of the example to the span created for it, so that
+    # relations can refer to the very same span objects.
+    id2labeled_span: Dict[str, LabeledSpan] = {}
+
+    document = DrugprotDocument(
+        text=example["text"],
+        title=example["title"],
+        abstract=example["abstract"],
+        id=example["document_id"],
+        metadata=metadata,
+    )
+    for span in example["entities"]:
+        # presumably `offset` holds character offsets into `text` — TODO confirm
+        labeled_span = LabeledSpan(
+            start=span["offset"][0],
+            end=span["offset"][1],
+            label=span["type"],
+        )
+        document.entities.append(labeled_span)
+        document.metadata["entity_ids"].append(span["id"])
+        id2labeled_span[span["id"]] = labeled_span
+    for relation in example["relations"]:
+        document.relations.append(
+            BinaryRelation(
+                head=id2labeled_span[relation["arg1_id"]],
+                tail=id2labeled_span[relation["arg2_id"]],
+                label=relation["type"],
+            )
+        )
+    return document
+
+
+def example2drugprot_bigbio(example: Dict[str, Any]) -> DrugprotBigbioDocument:
+    """Convert one `drugprot_bigbio_kb` example into a `DrugprotBigbioDocument`.
+
+    Args:
+        example: Dict providing the keys `document_id`, `passages`, `entities`
+            and `relations`. Each passage provides `text` (a sequence of
+            strings), `offsets` and `type`; each entity provides `id`, `type`
+            and `offsets`; each relation provides `type`, `arg1_id` and
+            `arg2_id`.
+
+    Returns:
+        The document with its `passages`, `entities` and `relations` layers
+        filled in the order given by the example. The entity ids are collected
+        under `metadata["entity_ids"]` in the same order as the `entities` layer.
+
+    Raises:
+        KeyError: If a required key is missing, or a relation refers to an
+            entity id that does not occur in `example["entities"]`.
+    """
+    # The strings of each passage are joined with a single space, and the
+    # passages are joined with a single space as well.
+    text = " ".join([" ".join(passage["text"]) for passage in example["passages"]])
+    doc_id = example["document_id"]
+    metadata = {"entity_ids": []}
+    # Maps the entity id of the example to the span created for it, so that
+    # relations can refer to the very same span objects.
+    id2labeled_span: Dict[str, LabeledSpan] = {}
+
+    document = DrugprotBigbioDocument(
+        text=text,
+        id=doc_id,
+        metadata=metadata,
+    )
+    for passage in example["passages"]:
+        # Only the first offset pair of a passage is used.
+        # NOTE(review): assumes these offsets index into the joined `text` — TODO confirm
+        document.passages.append(
+            LabeledSpan(
+                start=passage["offsets"][0][0],
+                end=passage["offsets"][0][1],
+                label=passage["type"],
+            )
+        )
+    # NOTE(review): entities and relations are added in the order given by the example; no sorting is applied here.
+    for span in example["entities"]:
+        # Only the first offset pair of an entity is used; any further pairs are ignored.
+        labeled_span = LabeledSpan(
+            start=span["offsets"][0][0],
+            end=span["offsets"][0][1],
+            label=span["type"],
+        )
+        document.entities.append(labeled_span)
+        document.metadata["entity_ids"].append(span["id"])
+        id2labeled_span[span["id"]] = labeled_span
+    for relation in example["relations"]:
+        document.relations.append(
+            BinaryRelation(
+                head=id2labeled_span[relation["arg1_id"]],
+                tail=id2labeled_span[relation["arg2_id"]],
+                label=relation["type"],
+            )
+        )
+    return document
+
+
+class Drugprot(GeneratorBasedBuilder):
+    """PIE dataset builder for DrugProt with the configs `drugprot_source` and `drugprot_bigbio_kb`.
+
+    The active config name selects the document type (see `DOCUMENT_TYPES`), the
+    document converters and the example-to-document conversion.
+    """
+    # Document class produced for each config name.
+    DOCUMENT_TYPES = {
+        "drugprot_source": DrugprotDocument,
+        "drugprot_bigbio_kb": DrugprotBigbioDocument,
+    }
+
+    # Base dataset and the revision it is pinned to.
+    # NOTE(review): presumably a Hugging Face Hub dataset path and commit hash — confirm against GeneratorBasedBuilder.
+    BASE_DATASET_PATH = "bigbio/drugprot"
+    BASE_DATASET_REVISION = "38ff03d68347aaf694e598c50cb164191f50f61c"
+
+    BUILDER_CONFIGS = [
+        datasets.BuilderConfig(
+            name="drugprot_source",
+            version=datasets.Version("1.0.2"),
+            description="DrugProt source version",
+        ),
+        datasets.BuilderConfig(
+            name="drugprot_bigbio_kb",
+            version=datasets.Version("1.0.0"),
+            description="DrugProt BigBio version",
+        ),
+    ]
+
+    @property
+    def document_converters(self):
+        """Return the document converters for the active config.
+
+        The result maps a target document type to a dict of layer names. The
+        keys are layer names of this dataset's document type; the values are
+        presumably the matching layer names of the target type (TODO confirm).
+
+        Raises:
+            ValueError: If the config name is neither `drugprot_source` nor
+                `drugprot_bigbio_kb`.
+        """
+        if self.config.name == "drugprot_source":
+            return {
+                TextDocumentWithLabeledSpansAndBinaryRelations: {
+                    "entities": "labeled_spans",
+                    "relations": "binary_relations",
+                }
+            }
+        elif self.config.name == "drugprot_bigbio_kb":
+            return {
+                TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions: {
+                    "passages": "labeled_partitions",
+                    "entities": "labeled_spans",
+                    "relations": "binary_relations",
+                }
+            }
+        else:
+            raise ValueError(f"Unknown dataset name: {self.config.name}")
+
+    def _generate_document(
+        self,
+        example: Dict[str, Any],
+    ) -> Union[DrugprotDocument, DrugprotBigbioDocument]:
+        """Convert one example into the document type of the active config.
+
+        Args:
+            example: A single example dict in the format of the active config.
+
+        Returns:
+            A `DrugprotDocument` for `drugprot_source`, or a
+            `DrugprotBigbioDocument` for `drugprot_bigbio_kb`.
+
+        Raises:
+            ValueError: If the config name is neither of the two known configs.
+        """
+        if self.config.name == "drugprot_source":
+            return example2drugprot(example)
+        elif self.config.name == "drugprot_bigbio_kb":
+            return example2drugprot_bigbio(example)
+        else:
+            raise ValueError(f"Unknown dataset config name: {self.config.name}")
diff --git a/dataset_builders/pie/drugprot/requirements.txt b/dataset_builders/pie/drugprot/requirements.txt
@@ -0,0 +1 @@
+pie-datasets>=0.9.0,<0.10.0
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -31,6 +31,7 @@ datasets = ">=2.14.0,<2.16.0"
 pyarrow = "^13"
 
 [tool.poetry.group.dev.dependencies]
+pytorch-ie = {version = ">=0.30.2,<0.31.0", optional = true}
 pie-modules = ">=0.10.8,<0.12.0"
 torch = {version = "^2.1.0+cpu", source = "pytorch"}
 pytest = "^7.4.2"