Skip to content

Commit

Permalink
Merge pull request #106 from ArneBinder/drugprot
Browse files Browse the repository at this point in the history
add DrugProt dataset
  • Loading branch information
ArneBinder authored Apr 5, 2024
2 parents 78da1b9 + f2f2b38 commit c549768
Show file tree
Hide file tree
Showing 7 changed files with 809 additions and 5 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,9 @@ venv.bak/
# Rope project settings
.ropeproject

# Vscode project settings
.vscode

# mkdocs documentation
/site

Expand Down
50 changes: 50 additions & 0 deletions dataset_builders/pie/drugprot/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# PIE Dataset Card for "DrugProt"

This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the
[DrugProt Huggingface dataset loading script](https://huggingface.co/datasets/bigbio/drugprot).

## Data Schema

Two versions of the dataset are supported: `drugprot_source` and `drugprot_bigbio_kb`.

#### `DrugprotDocument` for `drugprot_source`

defines the following fields:

- `text` (str)
- `id` (str, optional)
- `metadata` (dictionary, optional)
- `title` (str, optional)
- `abstract` (str, optional)

and the following annotation layers:

- `entities` (annotation type: `LabeledSpan`, target: `text`)
- `relations` (annotation type: `BinaryRelation`, target: `entities`)

#### `DrugprotBigbioDocument` for `drugprot_bigbio_kb`

defines the following fields:

- `text` (str)
- `id` (str, optional)
- `metadata` (dictionary, optional)

and the following annotation layers:

- `passages` (annotation type: `LabeledSpan`, target: `text`)
- `entities` (annotation type: `LabeledSpan`, target: `text`)
- `relations` (annotation type: `BinaryRelation`, target: `entities`)

See [here](https://github.com/ArneBinder/pie-modules/blob/main/src/pie_modules/annotations.py) for the annotation
type definitions.

## Document Converters

The dataset provides predefined document converters for the following target document types:

- `pie_modules.documents.TextDocumentWithLabeledSpansAndBinaryRelations` for `DrugprotDocument`
- `pie_modules.documents.TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions` for `DrugprotBigbioDocument`

See [here](https://github.com/ArneBinder/pie-modules/blob/main/src/pie_modules/documents.py) for the document type
definitions.
154 changes: 154 additions & 0 deletions dataset_builders/pie/drugprot/drugprot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
from dataclasses import dataclass
from typing import Any, Dict, Optional, Union

import datasets
from pie_modules.annotations import BinaryRelation, LabeledSpan
from pie_modules.documents import (
AnnotationLayer,
TextBasedDocument,
TextDocumentWithLabeledSpansAndBinaryRelations,
TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
annotation_field,
)

from pie_datasets import GeneratorBasedBuilder


@dataclass
class DrugprotDocument(TextBasedDocument):
    """Document type used for the `drugprot_source` config.

    Extends `TextBasedDocument` with optional `title` and `abstract` fields and
    two annotation layers, `entities` and `relations`.
    """

    # Optional title and abstract strings; both default to None.
    title: Optional[str] = None
    abstract: Optional[str] = None
    # Labeled spans targeting the `text` field.
    entities: AnnotationLayer[LabeledSpan] = annotation_field(target="text")
    # Binary relations targeting the `entities` layer.
    relations: AnnotationLayer[BinaryRelation] = annotation_field(target="entities")


@dataclass
class DrugprotBigbioDocument(TextBasedDocument):
    """Document type used for the `drugprot_bigbio_kb` config.

    Extends `TextBasedDocument` with three annotation layers: `passages`,
    `entities` and `relations`.
    """

    # Labeled spans targeting the `text` field, one per passage of the example.
    passages: AnnotationLayer[LabeledSpan] = annotation_field(target="text")
    # Labeled spans targeting the `text` field, one per entity of the example.
    entities: AnnotationLayer[LabeledSpan] = annotation_field(target="text")
    # Binary relations targeting the `entities` layer.
    relations: AnnotationLayer[BinaryRelation] = annotation_field(target="entities")


def example2drugprot(example: Dict[str, Any]) -> DrugprotDocument:
    """Convert one `drugprot_source` example into a `DrugprotDocument`.

    Args:
        example: A dict with the keys `text`, `title`, `abstract`, `document_id`,
            `entities` (each with `id`, `type` and `offset`) and `relations`
            (each with `type`, `arg1_id` and `arg2_id`).

    Returns:
        The document with its `entities` and `relations` layers filled in the
        order given by the example. The original entity ids are stored, in the
        same order, under `metadata["entity_ids"]`.

    Raises:
        KeyError: If a relation refers to an entity id that is not among the
            example's entities (or if an expected key is missing).
    """
    metadata: Dict[str, Any] = {"entity_ids": []}
    # Lookup from the source entity id to the span object added to the document,
    # so that relations point at the very same span instances.
    spans_by_id: Dict[str, LabeledSpan] = {}

    document = DrugprotDocument(
        text=example["text"],
        title=example["title"],
        abstract=example["abstract"],
        id=example["document_id"],
        metadata=metadata,
    )

    for entity in example["entities"]:
        entity_id = entity["id"]
        offset = entity["offset"]
        entity_span = LabeledSpan(start=offset[0], end=offset[1], label=entity["type"])
        document.entities.append(entity_span)
        document.metadata["entity_ids"].append(entity_id)
        spans_by_id[entity_id] = entity_span

    for rel in example["relations"]:
        binary_relation = BinaryRelation(
            head=spans_by_id[rel["arg1_id"]],
            tail=spans_by_id[rel["arg2_id"]],
            label=rel["type"],
        )
        document.relations.append(binary_relation)

    return document


def example2drugprot_bigbio(example: Dict[str, Any]) -> DrugprotBigbioDocument:
    """Convert one `drugprot_bigbio_kb` example into a `DrugprotBigbioDocument`.

    The document text is built by joining the text pieces of every passage with
    single spaces, and the passages themselves with single spaces as well.

    Args:
        example: A dict with the keys `document_id`, `passages` (each with `text`,
            `type` and `offsets`), `entities` (each with `id`, `type` and
            `offsets`) and `relations` (each with `type`, `arg1_id` and `arg2_id`).

    Returns:
        The document with its `passages`, `entities` and `relations` layers
        filled in the order given by the example. The original entity ids are
        stored, in the same order, under `metadata["entity_ids"]`.

    Raises:
        KeyError: If a relation refers to an entity id that is not among the
            example's entities (or if an expected key is missing).
    """
    passages = example["passages"]
    metadata: Dict[str, Any] = {"entity_ids": []}
    # Lookup from the source entity id to the span object added to the document,
    # so that relations point at the very same span instances.
    spans_by_id: Dict[str, LabeledSpan] = {}

    document = DrugprotBigbioDocument(
        text=" ".join(" ".join(passage["text"]) for passage in passages),
        id=example["document_id"],
        metadata=metadata,
    )

    # Only the first offset pair of each passage / entity is used.
    for passage in passages:
        first_offsets = passage["offsets"][0]
        passage_span = LabeledSpan(
            start=first_offsets[0],
            end=first_offsets[1],
            label=passage["type"],
        )
        document.passages.append(passage_span)

    # Entities and relations are added in the order in which the example lists them.
    for entity in example["entities"]:
        entity_id = entity["id"]
        first_offsets = entity["offsets"][0]
        entity_span = LabeledSpan(
            start=first_offsets[0],
            end=first_offsets[1],
            label=entity["type"],
        )
        document.entities.append(entity_span)
        document.metadata["entity_ids"].append(entity_id)
        spans_by_id[entity_id] = entity_span

    for rel in example["relations"]:
        binary_relation = BinaryRelation(
            head=spans_by_id[rel["arg1_id"]],
            tail=spans_by_id[rel["arg2_id"]],
            label=rel["type"],
        )
        document.relations.append(binary_relation)

    return document


class Drugprot(GeneratorBasedBuilder):
    """PIE dataset builder on top of the `bigbio/drugprot` Huggingface dataset.

    Two configs are available: `drugprot_source`, which yields `DrugprotDocument`
    instances, and `drugprot_bigbio_kb`, which yields `DrugprotBigbioDocument`
    instances.
    """

    # Document class produced for each config name.
    DOCUMENT_TYPES = {
        "drugprot_source": DrugprotDocument,
        "drugprot_bigbio_kb": DrugprotBigbioDocument,
    }

    # Base Huggingface dataset and the revision it is pinned to.
    BASE_DATASET_PATH = "bigbio/drugprot"
    BASE_DATASET_REVISION = "38ff03d68347aaf694e598c50cb164191f50f61c"

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="drugprot_source",
            version=datasets.Version("1.0.2"),
            description="DrugProt source version",
        ),
        datasets.BuilderConfig(
            name="drugprot_bigbio_kb",
            version=datasets.Version("1.0.0"),
            description="DrugProt BigBio version",
        ),
    ]

    @property
    def document_converters(self):
        """Return the predefined document converters for the active config.

        The result maps a target document type to a dict whose keys are layer
        names of this dataset's document type; the values appear to be the
        corresponding layer names of the target type.

        Raises:
            ValueError: If the active config name is not one of the known configs.
        """
        if self.config.name == "drugprot_source":
            return {
                TextDocumentWithLabeledSpansAndBinaryRelations: {
                    "entities": "labeled_spans",
                    "relations": "binary_relations",
                }
            }

        if self.config.name == "drugprot_bigbio_kb":
            return {
                TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions: {
                    "passages": "labeled_partitions",
                    "entities": "labeled_spans",
                    "relations": "binary_relations",
                }
            }

        raise ValueError(f"Unknown dataset name: {self.config.name}")

    def _generate_document(
        self,
        example: Dict[str, Any],
    ) -> Union[DrugprotDocument, DrugprotBigbioDocument]:
        """Convert a single example with the converter matching the active config.

        Args:
            example: One example of the base dataset.

        Returns:
            A `DrugprotDocument` for `drugprot_source`, or a
            `DrugprotBigbioDocument` for `drugprot_bigbio_kb`.

        Raises:
            ValueError: If the active config name is not one of the known configs.
        """
        converters = {
            "drugprot_source": example2drugprot,
            "drugprot_bigbio_kb": example2drugprot_bigbio,
        }
        convert = converters.get(self.config.name)
        if convert is None:
            raise ValueError(f"Unknown dataset config name: {self.config.name}")
        return convert(example)
1 change: 1 addition & 0 deletions dataset_builders/pie/drugprot/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pie-datasets>=0.9.0,<0.10.0
10 changes: 5 additions & 5 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ datasets = ">=2.14.0,<2.16.0"
pyarrow = "^13"

[tool.poetry.group.dev.dependencies]
pytorch-ie = {version = ">=0.30.2,<0.31.0", optional = true}
pie-modules = ">=0.10.8,<0.12.0"
torch = {version = "^2.1.0+cpu", source = "pytorch"}
pytest = "^7.4.2"
Expand Down
Loading

0 comments on commit c549768

Please sign in to comment.