-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #106 from ArneBinder/drugprot
add DrugProt dataset
- Loading branch information
Showing
7 changed files
with
809 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# PIE Dataset Card for "DrugProt" | ||
|
||
This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the | ||
[DrugProt Huggingface dataset loading script](https://huggingface.co/datasets/bigbio/drugprot). | ||
|
||
## Data Schema | ||
|
||
Two versions of the dataset are supported: `drugprot_source` and `drugprot_bigbio_kb`. | ||
|
||
#### `DrugprotDocument` for `drugprot_source` | ||
|
||
defines the following fields: | ||
|
||
- `text` (str) | ||
- `id` (str, optional) | ||
- `metadata` (dictionary, optional) | ||
- `title` (str, optional) | ||
- `abstract` (str, optional) | ||
|
||
and the following annotation layers: | ||
|
||
- `entities` (annotation type: `LabeledSpan`, target: `text`) | ||
- `relations` (annotation type: `BinaryRelation`, target: `entities`) | ||
|
||
#### `DrugprotBigbioDocument` for `drugprot_bigbio_kb` | ||
|
||
defines the following fields: | ||
|
||
- `text` (str) | ||
- `id` (str, optional) | ||
- `metadata` (dictionary, optional) | ||
|
||
and the following annotation layers: | ||
|
||
- `passages` (annotation type: `LabeledSpan`, target: `text`) | ||
- `entities` (annotation type: `LabeledSpan`, target: `text`) | ||
- `relations` (annotation type: `BinaryRelation`, target: `entities`) | ||
|
||
See [here](https://github.com/ArneBinder/pie-modules/blob/main/src/pie_modules/annotations.py) for the annotation | ||
type definitions. | ||
|
||
## Document Converters | ||
|
||
The dataset provides predefined document converters for the following target document types: | ||
|
||
- `pie_modules.documents.TextDocumentWithLabeledSpansAndBinaryRelations` for `DrugprotDocument` | ||
- `pie_modules.documents.TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions` for `DrugprotBigbioDocument` | ||
|
||
See [here](https://github.com/ArneBinder/pie-modules/blob/main/src/pie_modules/documents.py) for the document type | ||
definitions. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
from dataclasses import dataclass | ||
from typing import Any, Dict, Optional, Union | ||
|
||
import datasets | ||
from pie_modules.annotations import BinaryRelation, LabeledSpan | ||
from pie_modules.documents import ( | ||
AnnotationLayer, | ||
TextBasedDocument, | ||
TextDocumentWithLabeledSpansAndBinaryRelations, | ||
TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions, | ||
annotation_field, | ||
) | ||
|
||
from pie_datasets import GeneratorBasedBuilder | ||
|
||
|
||
@dataclass
class DrugprotDocument(TextBasedDocument):
    """Document type registered for the `drugprot_source` configuration.

    Extends `TextBasedDocument` with optional `title` and `abstract` strings
    and with two annotation layers: `entities` and `relations`.
    """

    # Optional title and abstract strings; both default to None.
    title: Optional[str] = None
    abstract: Optional[str] = None
    # Layer of LabeledSpan annotations that target the `text` field.
    entities: AnnotationLayer[LabeledSpan] = annotation_field(target="text")
    # Layer of BinaryRelation annotations that target the `entities` layer.
    relations: AnnotationLayer[BinaryRelation] = annotation_field(target="entities")
|
||
|
||
@dataclass
class DrugprotBigbioDocument(TextBasedDocument):
    """Document type registered for the `drugprot_bigbio_kb` configuration.

    Extends `TextBasedDocument` with three annotation layers: `passages`,
    `entities` and `relations`.
    """

    # Layer of LabeledSpan annotations marking passages within `text`.
    passages: AnnotationLayer[LabeledSpan] = annotation_field(target="text")
    # Layer of LabeledSpan annotations marking entities within `text`.
    entities: AnnotationLayer[LabeledSpan] = annotation_field(target="text")
    # Layer of BinaryRelation annotations that target the `entities` layer.
    relations: AnnotationLayer[BinaryRelation] = annotation_field(target="entities")
|
||
|
||
def example2drugprot(example: Dict[str, Any]) -> DrugprotDocument:
    """Convert one `drugprot_source` example into a `DrugprotDocument`.

    Args:
        example: Mapping that provides `text`, `title`, `abstract`,
            `document_id`, `entities` and `relations`. Each entity is read
            through its `id`, `type` and `offset` keys (`offset[0]` is used
            as the span start, `offset[1]` as the span end); each relation
            through its `type`, `arg1_id` and `arg2_id` keys.

    Returns:
        A document holding one `LabeledSpan` per entity and one
        `BinaryRelation` per relation, both in input order. The entity ids
        are recorded, in the same order, under `metadata["entity_ids"]`.

    Raises:
        KeyError: If an expected key is missing, or if a relation refers to
            an entity id that does not occur in `example["entities"]`.
    """
    document = DrugprotDocument(
        text=example["text"],
        title=example["title"],
        abstract=example["abstract"],
        id=example["document_id"],
        metadata={"entity_ids": []},
    )

    # Spans are looked up by their source id when the relations are built.
    span_by_id: Dict[str, LabeledSpan] = {}
    for entity in example["entities"]:
        span = LabeledSpan(
            start=entity["offset"][0],
            end=entity["offset"][1],
            label=entity["type"],
        )
        document.entities.append(span)
        entity_id = entity["id"]
        document.metadata["entity_ids"].append(entity_id)
        span_by_id[entity_id] = span

    for relation in example["relations"]:
        head_span = span_by_id[relation["arg1_id"]]
        tail_span = span_by_id[relation["arg2_id"]]
        document.relations.append(
            BinaryRelation(head=head_span, tail=tail_span, label=relation["type"])
        )

    return document
|
||
|
||
def example2drugprot_bigbio(example: Dict[str, Any]) -> DrugprotBigbioDocument:
    """Convert one `drugprot_bigbio_kb` example into a `DrugprotBigbioDocument`.

    The document text is rebuilt from the passages: the text pieces of each
    passage are joined with a single space, and the resulting passage texts
    are joined with a single space again.

    Args:
        example: Mapping that provides `document_id`, `passages`, `entities`
            and `relations`. Passages are read through `text`, `type` and
            `offsets`; entities through `id`, `type` and `offsets`; relations
            through `type`, `arg1_id` and `arg2_id`. Only the first offset
            pair (`offsets[0]`) of a passage or entity is used.

    Returns:
        A document with one `LabeledSpan` per passage, one `LabeledSpan` per
        entity and one `BinaryRelation` per relation. The entity ids are
        recorded under `metadata["entity_ids"]`.

    Raises:
        KeyError: If an expected key is missing, or if a relation refers to
            an entity id that does not occur in `example["entities"]`.
    """
    passage_texts = [" ".join(passage["text"]) for passage in example["passages"]]
    document = DrugprotBigbioDocument(
        text=" ".join(passage_texts),
        id=example["document_id"],
        metadata={"entity_ids": []},
    )

    for passage in example["passages"]:
        passage_span = LabeledSpan(
            start=passage["offsets"][0][0],
            end=passage["offsets"][0][1],
            label=passage["type"],
        )
        document.passages.append(passage_span)

    # Entities and relations are added in the order given by the example;
    # no sorting is applied here.
    span_by_id: Dict[str, LabeledSpan] = {}
    for entity in example["entities"]:
        entity_span = LabeledSpan(
            start=entity["offsets"][0][0],
            end=entity["offsets"][0][1],
            label=entity["type"],
        )
        document.entities.append(entity_span)
        entity_id = entity["id"]
        document.metadata["entity_ids"].append(entity_id)
        span_by_id[entity_id] = entity_span

    for relation in example["relations"]:
        head_span = span_by_id[relation["arg1_id"]]
        tail_span = span_by_id[relation["arg2_id"]]
        document.relations.append(
            BinaryRelation(head=head_span, tail=tail_span, label=relation["type"])
        )

    return document
|
||
|
||
class Drugprot(GeneratorBasedBuilder):
    """Dataset builder for DrugProt on top of the `bigbio/drugprot` base dataset.

    Two configurations are declared, `drugprot_source` and
    `drugprot_bigbio_kb`; each one maps to its own document type and its own
    predefined document converter.
    """

    # Document type produced for each configuration name.
    DOCUMENT_TYPES = {
        "drugprot_source": DrugprotDocument,
        "drugprot_bigbio_kb": DrugprotBigbioDocument,
    }

    # Base dataset identifier and the revision it is pinned to.
    BASE_DATASET_PATH = "bigbio/drugprot"
    BASE_DATASET_REVISION = "38ff03d68347aaf694e598c50cb164191f50f61c"

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="drugprot_source",
            version=datasets.Version("1.0.2"),
            description="DrugProt source version",
        ),
        datasets.BuilderConfig(
            name="drugprot_bigbio_kb",
            version=datasets.Version("1.0.0"),
            description="DrugProt BigBio version",
        ),
    ]

    @property
    def document_converters(self):
        """Return the converter mapping for the active configuration.

        The result maps a target document type to a dictionary that renames
        this dataset's annotation layers to the layer names of the target.

        Raises:
            ValueError: If the configuration name is not one of the two
                declared configurations.
        """
        config_name = self.config.name
        if config_name == "drugprot_source":
            return {
                TextDocumentWithLabeledSpansAndBinaryRelations: {
                    "entities": "labeled_spans",
                    "relations": "binary_relations",
                }
            }
        if config_name == "drugprot_bigbio_kb":
            return {
                TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions: {
                    "passages": "labeled_partitions",
                    "entities": "labeled_spans",
                    "relations": "binary_relations",
                }
            }
        raise ValueError(f"Unknown dataset name: {self.config.name}")

    def _generate_document(
        self,
        example: Dict[str, Any],
    ) -> Union[DrugprotDocument, DrugprotBigbioDocument]:
        """Convert one base-dataset example with the converter of the active configuration.

        Args:
            example: A single example of the base dataset.

        Returns:
            A `DrugprotDocument` for `drugprot_source`, or a
            `DrugprotBigbioDocument` for `drugprot_bigbio_kb`.

        Raises:
            ValueError: If the configuration name is not one of the two
                declared configurations.
        """
        config_name = self.config.name
        if config_name == "drugprot_source":
            return example2drugprot(example)
        if config_name == "drugprot_bigbio_kb":
            return example2drugprot_bigbio(example)
        raise ValueError(f"Unknown dataset config name: {self.config.name}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
pie-datasets>=0.9.0,<0.10.0 |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.